diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 02:25:50 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 02:25:50 +0000 |
commit | 19f4f86bfed21c5326ed2acebe1163f3a83e832b (patch) | |
tree | d59b9989ce55ed23693e80974d94c856f1c2c8b1 /src/core | |
parent | Initial commit. (diff) | |
download | systemd-upstream.tar.xz systemd-upstream.zip |
Adding upstream version 241.upstream/241upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
145 files changed, 74883 insertions, 0 deletions
diff --git a/src/core/all-units.h b/src/core/all-units.h new file mode 100644 index 0000000..ed8350e --- /dev/null +++ b/src/core/all-units.h @@ -0,0 +1,14 @@ +#pragma once + +#include "unit.h" + +#include "automount.h" +#include "device.h" +#include "path.h" +#include "scope.h" +#include "service.h" +#include "slice.h" +#include "socket.h" +#include "swap.h" +#include "target.h" +#include "timer.h" diff --git a/src/core/audit-fd.c b/src/core/audit-fd.c new file mode 100644 index 0000000..fdef433 --- /dev/null +++ b/src/core/audit-fd.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> + +#include "audit-fd.h" + +#if HAVE_AUDIT + +#include <libaudit.h> +#include <stdbool.h> + +#include "capability-util.h" +#include "fd-util.h" +#include "log.h" +#include "util.h" + +static bool initialized = false; +static int audit_fd; + +int get_audit_fd(void) { + + if (!initialized) { + if (have_effective_cap(CAP_AUDIT_WRITE) == 0) { + audit_fd = -EPERM; + initialized = true; + + return audit_fd; + } + + audit_fd = audit_open(); + + if (audit_fd < 0) { + if (!IN_SET(errno, EAFNOSUPPORT, EPROTONOSUPPORT)) + log_error_errno(errno, "Failed to connect to audit log: %m"); + + audit_fd = errno ? -errno : -EINVAL; + } + + initialized = true; + } + + return audit_fd; +} + +void close_audit_fd(void) { + + if (initialized && audit_fd >= 0) + safe_close(audit_fd); + + initialized = true; + audit_fd = -ECONNRESET; +} + +#else + +int get_audit_fd(void) { + return -EAFNOSUPPORT; +} + +void close_audit_fd(void) { +} + +#endif diff --git a/src/core/audit-fd.h b/src/core/audit-fd.h new file mode 100644 index 0000000..8c1e471 --- /dev/null +++ b/src/core/audit-fd.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +int get_audit_fd(void); +void close_audit_fd(void); diff --git a/src/core/automount.c b/src/core/automount.c new file mode 100644 index 0000000..6a83739 --- /dev/null +++ b/src/core/automount.c @@ -0,0 +1,1144 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/auto_dev-ioctl.h> +#include <linux/auto_fs4.h> +#include <sys/epoll.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "async.h" +#include "automount.h" +#include "bus-error.h" +#include "bus-util.h" +#include "dbus-automount.h" +#include "dbus-unit.h" +#include "fd-util.h" +#include "format-util.h" +#include "io-util.h" +#include "label.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mount.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = { + [AUTOMOUNT_DEAD] = UNIT_INACTIVE, + [AUTOMOUNT_WAITING] = UNIT_ACTIVE, + [AUTOMOUNT_RUNNING] = UNIT_ACTIVE, + [AUTOMOUNT_FAILED] = UNIT_FAILED +}; + +struct expire_data { + int dev_autofs_fd; + int ioctl_fd; +}; + +static void expire_data_free(struct expire_data *data) { + if (!data) + return; + + safe_close(data->dev_autofs_fd); + safe_close(data->ioctl_fd); + free(data); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct expire_data*, expire_data_free); + +static int open_dev_autofs(Manager *m); +static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata); +static int automount_start_expire(Automount *a); +static void automount_stop_expire(Automount *a); +static int automount_send_ready(Automount *a, Set *tokens, int status); + +static void automount_init(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + a->pipe_fd = -1; + a->directory_mode = 0755; + UNIT(a)->ignore_on_isolate = true; +} + +static void unmount_autofs(Automount *a) { + int r; + + assert(a); + + if (a->pipe_fd < 0) + return; + + a->pipe_event_source = sd_event_source_unref(a->pipe_event_source); + a->pipe_fd = safe_close(a->pipe_fd); + + /* If we reload/reexecute things we keep the mount point around */ + if (!IN_SET(UNIT(a)->manager->objective, MANAGER_RELOAD, MANAGER_REEXECUTE)) { + + automount_send_ready(a, a->tokens, -EHOSTDOWN); + automount_send_ready(a, a->expire_tokens, -EHOSTDOWN); + + if (a->where) { + r = repeat_unmount(a->where, MNT_DETACH); + if (r < 0) + log_error_errno(r, "Failed to unmount: %m"); + } + } +} + +static void automount_done(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + + unmount_autofs(a); + + a->where = mfree(a->where); + + a->tokens = set_free(a->tokens); + a->expire_tokens = set_free(a->expire_tokens); + + a->expire_event_source = sd_event_source_unref(a->expire_event_source); +} + +static int automount_add_trigger_dependencies(Automount *a) { + Unit *x; + int r; + + assert(a); + + r = unit_load_related_unit(UNIT(a), ".mount", &x); + if (r < 0) + return r; + + return unit_add_two_dependencies(UNIT(a), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int automount_add_mount_dependencies(Automount *a) { + _cleanup_free_ char *parent = NULL; + + assert(a); + + parent = dirname_malloc(a->where); + if (!parent) + return -ENOMEM; + + return unit_require_mounts_for(UNIT(a), parent, UNIT_DEPENDENCY_IMPLICIT); +} + +static int automount_add_default_dependencies(Automount *a) { + int r; + + assert(a); + + if (!UNIT(a)->default_dependencies) + return 0; + + if (!MANAGER_IS_SYSTEM(UNIT(a)->manager)) + return 0; + + r = unit_add_two_dependencies_by_name(UNIT(a), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return 0; +} + +static int automount_verify(Automount *a) { + _cleanup_free_ char *e = NULL; + int r; + + assert(a); + + if (UNIT(a)->load_state != UNIT_LOADED) + return 0; + + if (path_equal(a->where, "/")) { + log_unit_error(UNIT(a), "Cannot have an automount unit for the root directory. Refusing."); + return -ENOEXEC; + } + + r = unit_name_from_path(a->where, ".automount", &e); + if (r < 0) + return log_unit_error_errno(UNIT(a), r, "Failed to generate unit name from path: %m"); + + if (!unit_has_name(UNIT(a), e)) { + log_unit_error(UNIT(a), "Where= setting doesn't match unit name. Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static int automount_set_where(Automount *a) { + int r; + + assert(a); + + if (a->where) + return 0; + + r = unit_name_to_path(UNIT(a)->id, &a->where); + if (r < 0) + return r; + + path_simplify(a->where, false); + return 1; +} + +static int automount_load(Unit *u) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + /* Load a .automount file */ + r = unit_load_fragment_and_dropin(u); + if (r < 0) + return r; + + if (u->load_state == UNIT_LOADED) { + r = automount_set_where(a); + if (r < 0) + return r; + + r = automount_add_trigger_dependencies(a); + if (r < 0) + return r; + + r = automount_add_mount_dependencies(a); + if (r < 0) + return r; + + r = automount_add_default_dependencies(a); + if (r < 0) + return r; + } + + return automount_verify(a); +} + +static void automount_set_state(Automount *a, AutomountState state) { + AutomountState old_state; + assert(a); + + if (a->state != state) + bus_unit_send_pending_change_signal(UNIT(a), false); + + old_state = a->state; + a->state = state; + + if (state != AUTOMOUNT_RUNNING) + automount_stop_expire(a); + + if (!IN_SET(state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) + unmount_autofs(a); + + if (state != old_state) + log_unit_debug(UNIT(a), "Changed %s -> %s", automount_state_to_string(old_state), automount_state_to_string(state)); + + unit_notify(UNIT(a), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int automount_coldplug(Unit *u) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(a->state == AUTOMOUNT_DEAD); + + if (a->deserialized_state == a->state) + return 0; + + if (IN_SET(a->deserialized_state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) { + + r = automount_set_where(a); + if (r < 0) + return r; + + r = open_dev_autofs(u->manager); + if (r < 0) + return r; + + assert(a->pipe_fd >= 0); + + r = sd_event_add_io(u->manager->event, &a->pipe_event_source, a->pipe_fd, EPOLLIN, automount_dispatch_io, u); + if (r < 0) + return r; + + (void) sd_event_source_set_description(a->pipe_event_source, "automount-io"); + if (a->deserialized_state == AUTOMOUNT_RUNNING) { + r = automount_start_expire(a); + if (r < 0) + log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m"); + } + + automount_set_state(a, a->deserialized_state); + } + + return 0; +} + +static void automount_dump(Unit *u, FILE *f, const char *prefix) { + char time_string[FORMAT_TIMESPAN_MAX]; + Automount *a = AUTOMOUNT(u); + + assert(a); + + fprintf(f, + "%sAutomount State: %s\n" + "%sResult: %s\n" + "%sWhere: %s\n" + "%sDirectoryMode: %04o\n" + "%sTimeoutIdleUSec: %s\n", + prefix, automount_state_to_string(a->state), + prefix, automount_result_to_string(a->result), + prefix, a->where, + prefix, a->directory_mode, + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, a->timeout_idle_usec, USEC_PER_SEC)); +} + +static void automount_enter_dead(Automount *a, AutomountResult f) { + assert(a); + + if (a->result == AUTOMOUNT_SUCCESS) + a->result = f; + + unit_log_result(UNIT(a), a->result == AUTOMOUNT_SUCCESS, automount_result_to_string(a->result)); + automount_set_state(a, a->result != AUTOMOUNT_SUCCESS ? AUTOMOUNT_FAILED : AUTOMOUNT_DEAD); +} + +static int open_dev_autofs(Manager *m) { + struct autofs_dev_ioctl param; + + assert(m); + + if (m->dev_autofs_fd >= 0) + return m->dev_autofs_fd; + + (void) label_fix("/dev/autofs", 0); + + m->dev_autofs_fd = open("/dev/autofs", O_CLOEXEC|O_RDONLY); + if (m->dev_autofs_fd < 0) + return log_error_errno(errno, "Failed to open /dev/autofs: %m"); + + init_autofs_dev_ioctl(¶m); + if (ioctl(m->dev_autofs_fd, AUTOFS_DEV_IOCTL_VERSION, ¶m) < 0) { + m->dev_autofs_fd = safe_close(m->dev_autofs_fd); + return -errno; + } + + log_debug("Autofs kernel version %i.%i", param.ver_major, param.ver_minor); + + return m->dev_autofs_fd; +} + +static int open_ioctl_fd(int dev_autofs_fd, const char *where, dev_t devid) { + struct autofs_dev_ioctl *param; + size_t l; + + assert(dev_autofs_fd >= 0); + assert(where); + + l = sizeof(struct autofs_dev_ioctl) + strlen(where) + 1; + param = alloca(l); + + init_autofs_dev_ioctl(param); + param->size = l; + param->ioctlfd = -1; + param->openmount.devid = devid; + strcpy(param->path, where); + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_OPENMOUNT, param) < 0) + return -errno; + + if (param->ioctlfd < 0) + return -EIO; + + (void) fd_cloexec(param->ioctlfd, true); + return param->ioctlfd; +} + +static int autofs_protocol(int dev_autofs_fd, int ioctl_fd) { + uint32_t major, minor; + struct autofs_dev_ioctl param; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOVER, ¶m) < 0) + return -errno; + + major = param.protover.version; + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOSUBVER, ¶m) < 0) + return -errno; + + minor = param.protosubver.sub_version; + + log_debug("Autofs protocol version %i.%i", major, minor); + return 0; +} + +static int autofs_set_timeout(int dev_autofs_fd, int ioctl_fd, usec_t usec) { + struct autofs_dev_ioctl param; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (usec == USEC_INFINITY) + param.timeout.timeout = 0; + else + /* Convert to seconds, rounding up. */ + param.timeout.timeout = DIV_ROUND_UP(usec, USEC_PER_SEC); + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_TIMEOUT, ¶m) < 0) + return -errno; + + return 0; +} + +static int autofs_send_ready(int dev_autofs_fd, int ioctl_fd, uint32_t token, int status) { + struct autofs_dev_ioctl param; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (status != 0) { + param.fail.token = token; + param.fail.status = status; + } else + param.ready.token = token; + + if (ioctl(dev_autofs_fd, status ? AUTOFS_DEV_IOCTL_FAIL : AUTOFS_DEV_IOCTL_READY, ¶m) < 0) + return -errno; + + return 0; +} + +static int automount_send_ready(Automount *a, Set *tokens, int status) { + _cleanup_close_ int ioctl_fd = -1; + unsigned token; + int r; + + assert(a); + assert(status <= 0); + + if (set_isempty(tokens)) + return 0; + + ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); + if (ioctl_fd < 0) + return ioctl_fd; + + if (status != 0) + log_unit_debug_errno(UNIT(a), status, "Sending failure: %m"); + else + log_unit_debug(UNIT(a), "Sending success."); + + r = 0; + + /* Autofs thankfully does not hand out 0 as a token */ + while ((token = PTR_TO_UINT(set_steal_first(tokens)))) { + int k; + + /* Autofs fun fact: + * + * if you pass a positive status code here, kernels + * prior to 4.12 will freeze! Yay! */ + + k = autofs_send_ready(UNIT(a)->manager->dev_autofs_fd, + ioctl_fd, + token, + status); + if (k < 0) + r = k; + } + + return r; +} + +static void automount_trigger_notify(Unit *u, Unit *other) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(other); + + /* Filter out invocations with bogus state */ + if (other->load_state != UNIT_LOADED || other->type != UNIT_MOUNT) + return; + + /* Don't propagate state changes from the mount if we are already down */ + if (!IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) + return; + + /* Propagate start limit hit state */ + if (other->start_limit_hit) { + automount_enter_dead(a, AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT); + return; + } + + /* Don't propagate anything if there's still a job queued */ + if (other->job) + return; + + /* The mount is successfully established */ + if (IN_SET(MOUNT(other)->state, MOUNT_MOUNTED, MOUNT_REMOUNTING)) { + (void) automount_send_ready(a, a->tokens, 0); + + r = automount_start_expire(a); + if (r < 0) + log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m"); + + automount_set_state(a, AUTOMOUNT_RUNNING); + } + + if (IN_SET(MOUNT(other)->state, + MOUNT_MOUNTING, MOUNT_MOUNTING_DONE, + MOUNT_MOUNTED, MOUNT_REMOUNTING, + MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL, + MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL, + MOUNT_FAILED)) { + + (void) automount_send_ready(a, a->expire_tokens, -ENODEV); + } + + if (MOUNT(other)->state == MOUNT_DEAD) + (void) automount_send_ready(a, a->expire_tokens, 0); + + /* The mount is in some unhappy state now, let's unfreeze any waiting clients */ + if (IN_SET(MOUNT(other)->state, + MOUNT_DEAD, MOUNT_UNMOUNTING, + MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL, + MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL, + MOUNT_FAILED)) { + + (void) automount_send_ready(a, a->tokens, -ENODEV); + + automount_set_state(a, AUTOMOUNT_WAITING); + } +} + +static void automount_enter_waiting(Automount *a) { + _cleanup_close_ int ioctl_fd = -1; + int p[2] = { -1, -1 }; + char name[STRLEN("systemd-") + DECIMAL_STR_MAX(pid_t) + 1]; + char options[STRLEN("fd=,pgrp=,minproto=5,maxproto=5,direct") + + DECIMAL_STR_MAX(int) + DECIMAL_STR_MAX(gid_t) + 1]; + bool mounted = false; + int r, dev_autofs_fd; + struct stat st; + + assert(a); + assert(a->pipe_fd < 0); + assert(a->where); + + set_clear(a->tokens); + + r = unit_fail_if_noncanonical(UNIT(a), a->where); + if (r < 0) + goto fail; + + (void) mkdir_p_label(a->where, 0555); + + unit_warn_if_dir_nonempty(UNIT(a), a->where); + + dev_autofs_fd = open_dev_autofs(UNIT(a)->manager); + if (dev_autofs_fd < 0) { + r = dev_autofs_fd; + goto fail; + } + + if (pipe2(p, O_CLOEXEC) < 0) { + r = -errno; + goto fail; + } + r = fd_nonblock(p[0], true); + if (r < 0) + goto fail; + + xsprintf(options, "fd=%i,pgrp="PID_FMT",minproto=5,maxproto=5,direct", p[1], getpgrp()); + xsprintf(name, "systemd-"PID_FMT, getpid_cached()); + if (mount(name, a->where, "autofs", 0, options) < 0) { + r = -errno; + goto fail; + } + + mounted = true; + + p[1] = safe_close(p[1]); + + if (stat(a->where, &st) < 0) { + r = -errno; + goto fail; + } + + ioctl_fd = open_ioctl_fd(dev_autofs_fd, a->where, st.st_dev); + if (ioctl_fd < 0) { + r = ioctl_fd; + goto fail; + } + + r = autofs_protocol(dev_autofs_fd, ioctl_fd); + if (r < 0) + goto fail; + + r = autofs_set_timeout(dev_autofs_fd, ioctl_fd, a->timeout_idle_usec); + if (r < 0) + goto fail; + + r = sd_event_add_io(UNIT(a)->manager->event, &a->pipe_event_source, p[0], EPOLLIN, automount_dispatch_io, a); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(a->pipe_event_source, "automount-io"); + + a->pipe_fd = p[0]; + a->dev_id = st.st_dev; + + automount_set_state(a, AUTOMOUNT_WAITING); + + return; + +fail: + log_unit_error_errno(UNIT(a), r, "Failed to initialize automounter: %m"); + + safe_close_pair(p); + + if (mounted) { + r = repeat_unmount(a->where, MNT_DETACH); + if (r < 0) + log_error_errno(r, "Failed to unmount, ignoring: %m"); + } + + automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); +} + +static void *expire_thread(void *p) { + struct autofs_dev_ioctl param; + _cleanup_(expire_data_freep) struct expire_data *data = (struct expire_data*)p; + int r; + + assert(data->dev_autofs_fd >= 0); + assert(data->ioctl_fd >= 0); + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = data->ioctl_fd; + + do { + r = ioctl(data->dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, ¶m); + } while (r >= 0); + + if (errno != EAGAIN) + log_warning_errno(errno, "Failed to expire automount, ignoring: %m"); + + return NULL; +} + +static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) { + Automount *a = AUTOMOUNT(userdata); + _cleanup_(expire_data_freep) struct expire_data *data = NULL; + int r; + + assert(a); + assert(source == a->expire_event_source); + + data = new0(struct expire_data, 1); + if (!data) + return log_oom(); + + data->ioctl_fd = -1; + + data->dev_autofs_fd = fcntl(UNIT(a)->manager->dev_autofs_fd, F_DUPFD_CLOEXEC, 3); + if (data->dev_autofs_fd < 0) + return log_unit_error_errno(UNIT(a), errno, "Failed to duplicate autofs fd: %m"); + + data->ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); + if (data->ioctl_fd < 0) + return log_unit_error_errno(UNIT(a), data->ioctl_fd, "Couldn't open autofs ioctl fd: %m"); + + r = asynchronous_job(expire_thread, data); + if (r < 0) + return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m"); + + data = NULL; + + return automount_start_expire(a); +} + +static int automount_start_expire(Automount *a) { + int r; + usec_t timeout; + + assert(a); + + if (a->timeout_idle_usec == 0) + return 0; + + timeout = now(CLOCK_MONOTONIC) + MAX(a->timeout_idle_usec/3, USEC_PER_SEC); + + if (a->expire_event_source) { + r = sd_event_source_set_time(a->expire_event_source, timeout); + if (r < 0) + return r; + + return sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_ONESHOT); + } + + r = sd_event_add_time( + UNIT(a)->manager->event, + &a->expire_event_source, + CLOCK_MONOTONIC, timeout, 0, + automount_dispatch_expire, a); + if (r < 0) + return r; + + (void) sd_event_source_set_description(a->expire_event_source, "automount-expire"); + + return 0; +} + +static void automount_stop_expire(Automount *a) { + assert(a); + + if (!a->expire_event_source) + return; + + (void) sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_OFF); +} + +static void automount_enter_running(Automount *a) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *trigger; + struct stat st; + int r; + + assert(a); + + /* If the user masked our unit in the meantime, fail */ + if (UNIT(a)->load_state != UNIT_LOADED) { + log_unit_error(UNIT(a), "Suppressing automount event since unit is no longer loaded."); + goto fail; + } + + /* We don't take mount requests anymore if we are supposed to + * shut down anyway */ + if (unit_stop_pending(UNIT(a))) { + log_unit_debug(UNIT(a), "Suppressing automount request since unit stop is scheduled."); + automount_send_ready(a, a->tokens, -EHOSTDOWN); + automount_send_ready(a, a->expire_tokens, -EHOSTDOWN); + return; + } + + mkdir_p_label(a->where, a->directory_mode); + + /* Before we do anything, let's see if somebody is playing games with us? */ + if (lstat(a->where, &st) < 0) { + log_unit_warning_errno(UNIT(a), errno, "Failed to stat automount point: %m"); + goto fail; + } + + /* The mount unit may have been explicitly started before we got the + * autofs request. Ack it to unblock anything waiting on the mount point. */ + if (!S_ISDIR(st.st_mode) || st.st_dev != a->dev_id) { + log_unit_info(UNIT(a), "Automount point already active?"); + automount_send_ready(a, a->tokens, 0); + return; + } + + trigger = UNIT_TRIGGER(UNIT(a)); + if (!trigger) { + log_unit_error(UNIT(a), "Unit to trigger vanished."); + goto fail; + } + + r = manager_add_job(UNIT(a)->manager, JOB_START, trigger, JOB_REPLACE, &error, NULL); + if (r < 0) { + log_unit_warning(UNIT(a), "Failed to queue mount startup job: %s", bus_error_message(&error, r)); + goto fail; + } + + automount_set_state(a, AUTOMOUNT_RUNNING); + return; + +fail: + automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); +} + +static int automount_start(Unit *u) { + Automount *a = AUTOMOUNT(u); + Unit *trigger; + int r; + + assert(a); + assert(IN_SET(a->state, AUTOMOUNT_DEAD, AUTOMOUNT_FAILED)); + + if (path_is_mount_point(a->where, NULL, 0) > 0) { + log_unit_error(u, "Path %s is already a mount point, refusing start.", a->where); + return -EEXIST; + } + + trigger = UNIT_TRIGGER(u); + if (!trigger || trigger->load_state != UNIT_LOADED) { + log_unit_error(u, "Refusing to start, unit to trigger not loaded."); + return -ENOENT; + } + + r = unit_start_limit_test(u); + if (r < 0) { + automount_enter_dead(a, AUTOMOUNT_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + a->result = AUTOMOUNT_SUCCESS; + automount_enter_waiting(a); + return 1; +} + +static int automount_stop(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + assert(IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)); + + automount_enter_dead(a, AUTOMOUNT_SUCCESS); + return 1; +} + +static int automount_serialize(Unit *u, FILE *f, FDSet *fds) { + Automount *a = AUTOMOUNT(u); + Iterator i; + void *p; + int r; + + assert(a); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", automount_state_to_string(a->state)); + (void) serialize_item(f, "result", automount_result_to_string(a->result)); + (void) serialize_item_format(f, "dev-id", "%lu", (unsigned long) a->dev_id); + + SET_FOREACH(p, a->tokens, i) + (void) serialize_item_format(f, "token", "%u", PTR_TO_UINT(p)); + SET_FOREACH(p, a->expire_tokens, i) + (void) serialize_item_format(f, "expire-token", "%u", PTR_TO_UINT(p)); + + r = serialize_fd(f, fds, "pipe-fd", a->pipe_fd); + if (r < 0) + return r; + + return 0; +} + +static int automount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(fds); + + if (streq(key, "state")) { + AutomountState state; + + state = automount_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + a->deserialized_state = state; + } else if (streq(key, "result")) { + AutomountResult f; + + f = automount_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != AUTOMOUNT_SUCCESS) + a->result = f; + + } else if (streq(key, "dev-id")) { + unsigned long d; + + if (safe_atolu(value, &d) < 0) + log_unit_debug(u, "Failed to parse dev-id value: %s", value); + else + a->dev_id = (dev_t) d; + + } else if (streq(key, "token")) { + unsigned token; + + if (safe_atou(value, &token) < 0) + log_unit_debug(u, "Failed to parse token value: %s", value); + else { + r = set_ensure_allocated(&a->tokens, NULL); + if (r < 0) { + log_oom(); + return 0; + } + + r = set_put(a->tokens, UINT_TO_PTR(token)); + if (r < 0) + log_unit_error_errno(u, r, "Failed to add token to set: %m"); + } + } else if (streq(key, "expire-token")) { + unsigned token; + + if (safe_atou(value, &token) < 0) + log_unit_debug(u, "Failed to parse token value: %s", value); + else { + r = set_ensure_allocated(&a->expire_tokens, NULL); + if (r < 0) { + log_oom(); + return 0; + } + + r = set_put(a->expire_tokens, UINT_TO_PTR(token)); + if (r < 0) + log_unit_error_errno(u, r, "Failed to add expire token to set: %m"); + } + } else if (streq(key, "pipe-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse pipe-fd value: %s", value); + else { + safe_close(a->pipe_fd); + a->pipe_fd = fdset_remove(fds, fd); + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static UnitActiveState automount_active_state(Unit *u) { + assert(u); + + return state_translation_table[AUTOMOUNT(u)->state]; +} + +static const char *automount_sub_state_to_string(Unit *u) { + assert(u); + + return automount_state_to_string(AUTOMOUNT(u)->state); +} + +static bool automount_may_gc(Unit *u) { + Unit *t; + + assert(u); + + t = UNIT_TRIGGER(u); + if (!t) + return true; + + return UNIT_VTABLE(t)->may_gc(t); +} + +static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + union autofs_v5_packet_union packet; + Automount *a = AUTOMOUNT(userdata); + Unit *trigger; + int r; + + assert(a); + assert(fd == a->pipe_fd); + + if (events != EPOLLIN) { + log_unit_error(UNIT(a), "Got invalid poll event %"PRIu32" on pipe (fd=%d)", events, fd); + goto fail; + } + + r = loop_read_exact(a->pipe_fd, &packet, sizeof(packet), true); + if (r < 0) { + log_unit_error_errno(UNIT(a), r, "Invalid read from pipe: %m"); + goto fail; + } + + switch (packet.hdr.type) { + + case autofs_ptype_missing_direct: + + if (packet.v5_packet.pid > 0) { + _cleanup_free_ char *p = NULL; + + get_process_comm(packet.v5_packet.pid, &p); + log_unit_info(UNIT(a), "Got automount request for %s, triggered by %"PRIu32" (%s)", a->where, packet.v5_packet.pid, strna(p)); + } else + log_unit_debug(UNIT(a), "Got direct mount request on %s", a->where); + + r = set_ensure_allocated(&a->tokens, NULL); + if (r < 0) { + log_unit_error(UNIT(a), "Failed to allocate token set."); + goto fail; + } + + r = set_put(a->tokens, UINT_TO_PTR(packet.v5_packet.wait_queue_token)); + if (r < 0) { + log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m"); + goto fail; + } + + automount_enter_running(a); + break; + + case autofs_ptype_expire_direct: + log_unit_debug(UNIT(a), "Got direct umount request on %s", a->where); + + automount_stop_expire(a); + + r = set_ensure_allocated(&a->expire_tokens, NULL); + if (r < 0) { + log_unit_error(UNIT(a), "Failed to allocate token set."); + goto fail; + } + + r = set_put(a->expire_tokens, UINT_TO_PTR(packet.v5_packet.wait_queue_token)); + if (r < 0) { + log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m"); + goto fail; + } + + trigger = UNIT_TRIGGER(UNIT(a)); + if (!trigger) { + log_unit_error(UNIT(a), "Unit to trigger vanished."); + goto fail; + } + + r = manager_add_job(UNIT(a)->manager, JOB_STOP, trigger, JOB_REPLACE, &error, NULL); + if (r < 0) { + log_unit_warning(UNIT(a), "Failed to queue umount startup job: %s", bus_error_message(&error, r)); + goto fail; + } + break; + + default: + log_unit_error(UNIT(a), "Received unknown automount request %i", packet.hdr.type); + break; + } + + return 0; + +fail: + automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); + return 0; +} + +static void automount_shutdown(Manager *m) { + assert(m); + + m->dev_autofs_fd = safe_close(m->dev_autofs_fd); +} + +static void automount_reset_failed(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + + if (a->state == AUTOMOUNT_FAILED) + automount_set_state(a, AUTOMOUNT_DEAD); + + a->result = AUTOMOUNT_SUCCESS; +} + +static bool automount_supported(void) { + static int supported = -1; + + if (supported < 0) + supported = access("/dev/autofs", F_OK) >= 0; + + return supported; +} + +static const char* const automount_result_table[_AUTOMOUNT_RESULT_MAX] = { + [AUTOMOUNT_SUCCESS] = "success", + [AUTOMOUNT_FAILURE_RESOURCES] = "resources", + [AUTOMOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT] = "mount-start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(automount_result, AutomountResult); + +const UnitVTable automount_vtable = { + .object_size = sizeof(Automount), + + .sections = + "Unit\0" + "Automount\0" + "Install\0", + + .init = automount_init, + .load = automount_load, + .done = automount_done, + + .coldplug = automount_coldplug, + + .dump = automount_dump, + + .start = automount_start, + .stop = automount_stop, + + .serialize = automount_serialize, + .deserialize_item = automount_deserialize_item, + + .active_state = automount_active_state, + .sub_state_to_string = automount_sub_state_to_string, + + .may_gc = automount_may_gc, + + .trigger_notify = automount_trigger_notify, + + .reset_failed = automount_reset_failed, + + .bus_vtable = bus_automount_vtable, + .bus_set_property = bus_automount_set_property, + + .can_transient = true, + + .shutdown = automount_shutdown, + .supported = automount_supported, + + .status_message_formats = { + .finished_start_job = { + [JOB_DONE] = "Set up automount %s.", + [JOB_FAILED] = "Failed to set up automount %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Unset automount %s.", + [JOB_FAILED] = "Failed to unset automount %s.", + }, + }, +}; diff --git a/src/core/automount.h b/src/core/automount.h new file mode 100644 index 0000000..21dd1c0 --- /dev/null +++ b/src/core/automount.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct Automount Automount; + +#include "unit.h" + +typedef enum AutomountResult { + AUTOMOUNT_SUCCESS, + AUTOMOUNT_FAILURE_RESOURCES, + AUTOMOUNT_FAILURE_START_LIMIT_HIT, + AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT, + _AUTOMOUNT_RESULT_MAX, + _AUTOMOUNT_RESULT_INVALID = -1 +} AutomountResult; + +struct Automount { + Unit meta; + + AutomountState state, deserialized_state; + + char *where; + usec_t timeout_idle_usec; + + int pipe_fd; + sd_event_source *pipe_event_source; + mode_t directory_mode; + dev_t dev_id; + + Set *tokens; + Set *expire_tokens; + + sd_event_source *expire_event_source; + + AutomountResult result; +}; + +extern const UnitVTable automount_vtable; + +const char* automount_result_to_string(AutomountResult i) _const_; +AutomountResult automount_result_from_string(const char *s) _pure_; + +DEFINE_CAST(AUTOMOUNT, Automount); diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c new file mode 100644 index 0000000..81e91fc --- /dev/null +++ b/src/core/bpf-devices.c @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#include <linux/libbpf.h> + +#include "bpf-devices.h" +#include "bpf-program.h" + +#define PASS_JUMP_OFF 4096 + +static int bpf_access_type(const char *acc) { + int r = 0; + + assert(acc); + + for (; *acc; acc++) + switch(*acc) { + case 'r': + r |= BPF_DEVCG_ACC_READ; + break; + case 'w': + r |= BPF_DEVCG_ACC_WRITE; + break; + case 'm': + r |= BPF_DEVCG_ACC_MKNOD; + break; + default: + return -EINVAL; + } + + return r; +} + +int cgroup_bpf_whitelist_device(BPFProgram *prog, int type, int major, int minor, const char *acc) { + struct bpf_insn insn[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 6), /* compare device type */ + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */ + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 2), /* compare major */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, 1), /* compare minor */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + int r, access; + + assert(prog); + assert(acc); + + access = bpf_access_type(acc); + if (access <= 0) + return -EINVAL; + + insn[2].imm = access; + + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +int cgroup_bpf_whitelist_major(BPFProgram *prog, int type, int major, const char *acc) { + struct bpf_insn insn[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 5), /* compare device type */ + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */ + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1), /* compare major */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + int r, access; + + assert(prog); + assert(acc); + + access = bpf_access_type(acc); + if (access <= 0) + return -EINVAL; + + insn[2].imm = access; + + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +int cgroup_bpf_whitelist_class(BPFProgram *prog, int type, const char *acc) { + struct bpf_insn insn[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 5), /* compare device type */ + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */ + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 1), /* compare access type */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + int r, access; + + assert(prog); + assert(acc); + + access = bpf_access_type(acc); + if (access <= 0) + return -EINVAL; + + insn[2].imm = access; + + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist) { + struct bpf_insn pre_insn[] = { + /* load device type to r2 */ + BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, access_type)), + + /* load access type to r3 */ + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16), + + /* load major number to r4 */ + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, major)), + + /* load minor number to r5 */ + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, minor)), + }; + + _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL; + int r; + + assert(ret); + + if (policy == CGROUP_AUTO && !whitelist) + return 0; + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &prog); + if (r < 0) + return log_error_errno(r, "Loading device control BPF program failed: %m"); + + if (policy == CGROUP_CLOSED || whitelist) { + r = bpf_program_add_instructions(prog, pre_insn, ELEMENTSOF(pre_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + } + + *ret = TAKE_PTR(prog); + + return 0; +} + +int cgroup_apply_device_bpf(Unit *u, BPFProgram *prog, CGroupDevicePolicy policy, bool whitelist) { + struct bpf_insn post_insn[] = { + /* return DENY */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + + }; + + struct bpf_insn exit_insn[] = { + /* else return ALLOW */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_free_ char *path = NULL; + int r; + + if (!prog) { + /* Remove existing program. */ + u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed); + return 0; + } + + if (policy != CGROUP_STRICT || whitelist) { + size_t off; + + r = bpf_program_add_instructions(prog, post_insn, ELEMENTSOF(post_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + + /* Fixup PASS_JUMP_OFF jump offsets. */ + for (off = 0; off < prog->n_instructions; off++) { + struct bpf_insn *ins = &prog->instructions[off]; + + if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF) + ins->off = prog->n_instructions - off - 1; + } + } else + /* Explicitly forbid everything. */ + exit_insn[0].imm = 0; + + r = bpf_program_add_instructions(prog, exit_insn, ELEMENTSOF(exit_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path); + if (r < 0) + return log_error_errno(r, "Failed to determine cgroup path: %m"); + + r = bpf_program_cgroup_attach(prog, BPF_CGROUP_DEVICE, path, BPF_F_ALLOW_MULTI); + if (r < 0) + return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m", path); + + /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program. */ + u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed); + + /* Remember that this BPF program is installed now. */ + u->bpf_device_control_installed = bpf_program_ref(prog); + + return 0; +} + +int bpf_devices_supported(void) { + struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL; + static int supported = -1; + int r; + + /* Checks whether BPF device controller is supported. For this, we check five things: + * + * a) whether we are privileged + * b) whether the unified hierarchy is being used + * c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require + */ + + if (supported >= 0) + return supported; + + if (geteuid() != 0) { + log_debug("Not enough privileges, BPF device control is not supported."); + return supported = 0; + } + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m"); + if (r == 0) { + log_debug("Not running with unified cgroups, BPF device control is not supported."); + return supported = 0; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &program); + if (r < 0) { + log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial)); + if (r < 0) { + log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + r = bpf_program_load_kernel(program, NULL, 0); + if (r < 0) { + log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + return supported = 1; +} diff --git a/src/core/bpf-devices.h b/src/core/bpf-devices.h new file mode 100644 index 0000000..8d3de3b --- /dev/null +++ b/src/core/bpf-devices.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <inttypes.h> + +#include "unit.h" + +struct BPFProgram; + +int bpf_devices_supported(void); + +int cgroup_bpf_whitelist_device(BPFProgram *p, int type, int major, int minor, const char *acc); +int cgroup_bpf_whitelist_major(BPFProgram *p, int type, int major, const char *acc); +int cgroup_bpf_whitelist_class(BPFProgram *prog, int type, const char *acc); + +int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist); +int cgroup_apply_device_bpf(Unit *u, BPFProgram *p, CGroupDevicePolicy policy, bool whitelist); diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c new file mode 100644 index 0000000..b9a611f --- /dev/null +++ b/src/core/bpf-firewall.c @@ -0,0 +1,767 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <arpa/inet.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/libbpf.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bpf-program.h" +#include "fd-util.h" +#include "ip-address-access.h" +#include "missing_syscall.h" +#include "unit.h" + +enum { + MAP_KEY_PACKETS, + MAP_KEY_BYTES, +}; + +enum { + ACCESS_ALLOWED = 1, + ACCESS_DENIED = 2, +}; + +/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */ + +static int add_lookup_instructions( + BPFProgram *p, + int map_fd, + int protocol, + bool is_ingress, + int verdict) { + + int r, addr_offset, addr_size; + + assert(p); + assert(map_fd >= 0); + + switch (protocol) { + + case ETH_P_IP: + addr_size = sizeof(uint32_t); + addr_offset = is_ingress ? + offsetof(struct iphdr, saddr) : + offsetof(struct iphdr, daddr); + break; + + case ETH_P_IPV6: + addr_size = 4 * sizeof(uint32_t); + addr_offset = is_ingress ? + offsetof(struct ip6_hdr, ip6_src.s6_addr) : + offsetof(struct ip6_hdr, ip6_dst.s6_addr); + break; + + default: + return -EAFNOSUPPORT; + } + + do { + /* Compare IPv4 with one word instruction (32bit) */ + struct bpf_insn insn[] = { + /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0), + + /* + * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address + * + * R1: Pointer to the skb + * R2: Data offset + * R3: Destination buffer on the stack (r10 - 4) + * R4: Number of bytes to read (4) + */ + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV32_IMM(BPF_REG_2, addr_offset), + + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size), + + BPF_MOV32_IMM(BPF_REG_4, addr_size), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + + /* + * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the + * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key' + * has to be set to the maximum possible value. + * + * On success, the looked up value is stored in R0. For this application, the actual + * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any + * matching value. + */ + + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)), + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8), + + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict), + }; + + /* Jump label fixup */ + insn[0].off = ELEMENTSOF(insn) - 1; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + + } while (false); + + return 0; +} + +static int bpf_firewall_compile_bpf( + Unit *u, + bool is_ingress, + BPFProgram **ret) { + + struct bpf_insn pre_insn[] = { + /* + * When the eBPF program is entered, R1 contains the address of the skb. + * However, R1-R5 are scratch registers that are not preserved when calling + * into kernel functions, so we need to save anything that's supposed to + * stay around to R6-R9. Save the skb to R6. + */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* + * Although we cannot access the skb data directly from eBPF programs used in this + * scenario, the kernel has prepared some fields for us to access through struct __sk_buff. + * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7 + * for later use. + */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)), + + /* + * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet + * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning. + */ + BPF_MOV32_IMM(BPF_REG_8, 0), + }; + + /* + * The access checkers compiled for the configured allowance and denial lists + * write to R8 at runtime. The following code prepares for an early exit that + * skip the accounting if the packet is denied. + * + * R0 = 1 + * if (R8 == ACCESS_DENIED) + * R0 = 0 + * + * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet + * is allowed to pass. + */ + struct bpf_insn post_insn[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + }; + + _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; + int accounting_map_fd, r; + bool access_enabled; + + assert(u); + assert(ret); + + accounting_map_fd = is_ingress ? + u->ip_accounting_ingress_map_fd : + u->ip_accounting_egress_map_fd; + + access_enabled = + u->ipv4_allow_map_fd >= 0 || + u->ipv6_allow_map_fd >= 0 || + u->ipv4_deny_map_fd >= 0 || + u->ipv6_deny_map_fd >= 0; + + if (accounting_map_fd < 0 && !access_enabled) { + *ret = NULL; + return 0; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p); + if (r < 0) + return r; + + r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn)); + if (r < 0) + return r; + + if (access_enabled) { + /* + * The simple rule this function translates into eBPF instructions is: + * + * - Access will be granted when an address matches an entry in @list_allow + * - Otherwise, access will be denied when an address matches an entry in @list_deny + * - Otherwise, access will be granted + */ + + if (u->ipv4_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv6_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv4_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (u->ipv6_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED); + if (r < 0) + return r; + } + } + + r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn)); + if (r < 0) + return r; + + if (accounting_map_fd >= 0) { + struct bpf_insn insn[] = { + /* + * If R0 == 0, the packet will be denied; skip the accounting instructions in this case. + * The jump label will be fixed up later. + */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0), + + /* Count packets */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Count bytes */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Allow the packet to pass */ + BPF_MOV64_IMM(BPF_REG_0, 1), + }; + + /* Jump label fixup */ + insn[0].off = ELEMENTSOF(insn) - 1; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } + + do { + /* + * Exit from the eBPF program, R0 contains the verdict. + * 0 means the packet is denied, 1 means the packet may pass. + */ + struct bpf_insn insn[] = { + BPF_EXIT_INSN() + }; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } while (false); + + *ret = TAKE_PTR(p); + + return 0; +} + +static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) { + IPAddressAccessItem *a; + + assert(n_ipv4); + assert(n_ipv6); + + LIST_FOREACH(items, a, list) { + switch (a->family) { + + case AF_INET: + (*n_ipv4)++; + break; + + case AF_INET6: + (*n_ipv6)++; + break; + + default: + return -EAFNOSUPPORT; + } + } + + return 0; +} + +static int bpf_firewall_add_access_items( + IPAddressAccessItem *list, + int ipv4_map_fd, + int ipv6_map_fd, + int verdict) { + + struct bpf_lpm_trie_key *key_ipv4, *key_ipv6; + uint64_t value = verdict; + IPAddressAccessItem *a; + int r; + + key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)); + key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4); + + LIST_FOREACH(items, a, list) { + switch (a->family) { + + case AF_INET: + key_ipv4->prefixlen = a->prefixlen; + memcpy(key_ipv4->data, &a->address, sizeof(uint32_t)); + + r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value); + if (r < 0) + return r; + + break; + + case AF_INET6: + key_ipv6->prefixlen = a->prefixlen; + memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t)); + + r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value); + if (r < 0) + return r; + + break; + + default: + return -EAFNOSUPPORT; + } + } + + return 0; +} + +static int bpf_firewall_prepare_access_maps( + Unit *u, + int verdict, + int *ret_ipv4_map_fd, + int *ret_ipv6_map_fd) { + + _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1; + size_t n_ipv4 = 0, n_ipv6 = 0; + Unit *p; + int r; + + assert(ret_ipv4_map_fd); + assert(ret_ipv6_map_fd); + + for (p = u; p; p = UNIT_DEREF(p->slice)) { + CGroupContext *cc; + + cc = unit_get_cgroup_context(p); + if (!cc) + continue; + + bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6); + } + + if (n_ipv4 > 0) { + ipv4_map_fd = bpf_map_new( + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t), + sizeof(uint64_t), + n_ipv4, + BPF_F_NO_PREALLOC); + if (ipv4_map_fd < 0) + return ipv4_map_fd; + } + + if (n_ipv6 > 0) { + ipv6_map_fd = bpf_map_new( + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4, + sizeof(uint64_t), + n_ipv6, + BPF_F_NO_PREALLOC); + if (ipv6_map_fd < 0) + return ipv6_map_fd; + } + + for (p = u; p; p = UNIT_DEREF(p->slice)) { + CGroupContext *cc; + + cc = unit_get_cgroup_context(p); + if (!cc) + continue; + + r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, + ipv4_map_fd, ipv6_map_fd, verdict); + if (r < 0) + return r; + } + + *ret_ipv4_map_fd = ipv4_map_fd; + *ret_ipv6_map_fd = ipv6_map_fd; + + ipv4_map_fd = ipv6_map_fd = -1; + return 0; +} + +static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) { + int r; + + assert(u); + assert(fd_ingress); + assert(fd_egress); + + if (enabled) { + if (*fd_ingress < 0) { + r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_ingress = r; + } + + if (*fd_egress < 0) { + + r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_egress = r; + } + + } else { + *fd_ingress = safe_close(*fd_ingress); + *fd_egress = safe_close(*fd_egress); + + zero(u->ip_accounting_extra); + } + + return 0; +} + +int bpf_firewall_compile(Unit *u) { + CGroupContext *cc; + int r, supported; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + if (supported == BPF_FIREWALL_UNSUPPORTED) { + log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without."); + return -EOPNOTSUPP; + } + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) { + /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice + * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption + * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is + * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at + * all, either. */ + log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units."); + return -EOPNOTSUPP; + } + + /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves, + * but we reuse the the accounting maps. That way the firewall in effect always maps to the actual + * configuration, but we don't flush out the accounting unnecessarily */ + + u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress); + u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress); + + u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); + u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd); + + u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd); + u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd); + + if (u->type != UNIT_SLICE) { + /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf + * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that + * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this + * means that all configure IP access rules *will* take effect on processes, even though we never + * compile them for inner nodes. */ + + r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd); + if (r < 0) + return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m"); + + r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd); + if (r < 0) + return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m"); + } + + r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd); + if (r < 0) + return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m"); + + r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress); + if (r < 0) + return log_unit_error_errno(u, r, "Compilation for ingress BPF program failed: %m"); + + r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress); + if (r < 0) + return log_unit_error_errno(u, r, "Compilation for egress BPF program failed: %m"); + + return 0; +} + +int bpf_firewall_install(Unit *u) { + _cleanup_free_ char *path = NULL; + CGroupContext *cc; + int r, supported; + uint32_t flags; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + if (!u->cgroup_path) + return -EINVAL; + if (!u->cgroup_realized) + return -EINVAL; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + if (supported == BPF_FIREWALL_UNSUPPORTED) { + log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without."); + return -EOPNOTSUPP; + } + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) { + log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units."); + return -EOPNOTSUPP; + } + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m"); + + flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI && + (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0; + + /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to + * minimize the time window when we don't account for IP traffic. */ + u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed); + u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed); + + if (u->ip_bpf_egress) { + r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags); + if (r < 0) + return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path); + + /* Remember that this BPF program is installed now. */ + u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress); + } + + if (u->ip_bpf_ingress) { + r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags); + if (r < 0) + return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path); + + u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress); + } + + return 0; +} + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) { + uint64_t key, packets; + int r; + + if (map_fd < 0) + return -EBADF; + + if (ret_packets) { + key = MAP_KEY_PACKETS; + r = bpf_map_lookup_element(map_fd, &key, &packets); + if (r < 0) + return r; + } + + if (ret_bytes) { + key = MAP_KEY_BYTES; + r = bpf_map_lookup_element(map_fd, &key, ret_bytes); + if (r < 0) + return r; + } + + if (ret_packets) + *ret_packets = packets; + + return 0; +} + +int bpf_firewall_reset_accounting(int map_fd) { + uint64_t key, value = 0; + int r; + + if (map_fd < 0) + return -EBADF; + + key = MAP_KEY_PACKETS; + r = bpf_map_update_element(map_fd, &key, &value); + if (r < 0) + return r; + + key = MAP_KEY_BYTES; + return bpf_map_update_element(map_fd, &key, &value); +} + +int bpf_firewall_supported(void) { + struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL; + static int supported = -1; + union bpf_attr attr; + int fd, r; + + /* Checks whether BPF firewalling is supported. For this, we check five things: + * + * a) whether we are privileged + * b) whether the unified hierarchy is being used + * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require + * d) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require + * e) the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require + */ + + if (supported >= 0) + return supported; + + if (geteuid() != 0) { + log_debug("Not enough privileges, BPF firewalling is not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m"); + if (r == 0) { + log_debug("Not running with unified cgroups, BPF firewalling is not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t), + sizeof(uint64_t), + 1, + BPF_F_NO_PREALLOC); + if (fd < 0) { + log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + safe_close(fd); + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program); + if (r < 0) { + log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial)); + if (r < 0) { + log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = bpf_program_load_kernel(program, NULL, 0); + if (r < 0) { + log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF + * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF + * program if we can't do a thing with it later? + * + * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if + * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the + * parameters are validated however, and that'll fail with EBADF then. */ + + attr = (union bpf_attr) { + .attach_type = BPF_CGROUP_INET_EGRESS, + .target_fd = -1, + .attach_bpf_fd = -1, + }; + + if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) { + if (errno != EBADF) { + log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* YAY! */ + } else { + log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported + * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH + * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll + * get EINVAL if it's not supported, and EBADF as before if it is available. */ + + attr = (union bpf_attr) { + .attach_type = BPF_CGROUP_INET_EGRESS, + .target_fd = -1, + .attach_bpf_fd = -1, + .attach_flags = BPF_F_ALLOW_MULTI, + }; + + if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) { + if (errno == EBADF) { + log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!"); + return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI; + } + + if (errno == EINVAL) + log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported."); + else + log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m"); + + return supported = BPF_FIREWALL_SUPPORTED; + } else { + log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } +} diff --git a/src/core/bpf-firewall.h b/src/core/bpf-firewall.h new file mode 100644 index 0000000..7d38483 --- /dev/null +++ b/src/core/bpf-firewall.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <inttypes.h> + +#include "unit.h" + +enum { + BPF_FIREWALL_UNSUPPORTED = 0, + BPF_FIREWALL_SUPPORTED = 1, + BPF_FIREWALL_SUPPORTED_WITH_MULTI = 2, +}; + +int bpf_firewall_supported(void); + +int bpf_firewall_compile(Unit *u); +int bpf_firewall_install(Unit *u); + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets); +int bpf_firewall_reset_accounting(int map_fd); diff --git a/src/core/cgroup.c b/src/core/cgroup.c new file mode 100644 index 0000000..18d470b --- /dev/null +++ b/src/core/cgroup.c @@ -0,0 +1,3074 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <fcntl.h> +#include <fnmatch.h> + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "bpf-firewall.h" +#include "btrfs-util.h" +#include "bpf-devices.h" +#include "bus-error.h" +#include "cgroup-util.h" +#include "cgroup.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "procfs-util.h" +#include "special.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "virt.h" + +#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC) + +/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access + * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask + * out specific attributes from us. */ +#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING) + +bool manager_owns_host_root_cgroup(Manager *m) { + assert(m); + + /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the + * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there's + * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if + * we run in any kind of container virtualization. */ + + if (MANAGER_IS_USER(m)) + return false; + + if (detect_container() > 0) + return false; + + return empty_or_root(m->cgroup_root); +} + +bool unit_has_host_root_cgroup(Unit *u) { + assert(u); + + /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and + * the manager manages the root cgroup. */ + + if (!manager_owns_host_root_cgroup(u->manager)) + return false; + + return unit_has_name(u, SPECIAL_ROOT_SLICE); +} + +static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) { + int r; + + r = cg_set_attribute(controller, u->cgroup_path, attribute, value); + if (r < 0) + log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m", + strna(attribute), isempty(u->cgroup_path) ? "/" : u->cgroup_path, (int) strcspn(value, NEWLINE), value); + + return r; +} + +static void cgroup_compat_warn(void) { + static bool cgroup_compat_warned = false; + + if (cgroup_compat_warned) + return; + + log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. " + "See cgroup-compat debug messages for details."); + + cgroup_compat_warned = true; +} + +#define log_cgroup_compat(unit, fmt, ...) do { \ + cgroup_compat_warn(); \ + log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \ + } while (false) + +void cgroup_context_init(CGroupContext *c) { + assert(c); + + /* Initialize everything to the kernel defaults. */ + + *c = (CGroupContext) { + .cpu_weight = CGROUP_WEIGHT_INVALID, + .startup_cpu_weight = CGROUP_WEIGHT_INVALID, + .cpu_quota_per_sec_usec = USEC_INFINITY, + + .cpu_shares = CGROUP_CPU_SHARES_INVALID, + .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID, + + .memory_high = CGROUP_LIMIT_MAX, + .memory_max = CGROUP_LIMIT_MAX, + .memory_swap_max = CGROUP_LIMIT_MAX, + + .memory_limit = CGROUP_LIMIT_MAX, + + .io_weight = CGROUP_WEIGHT_INVALID, + .startup_io_weight = CGROUP_WEIGHT_INVALID, + + .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID, + .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID, + + .tasks_max = CGROUP_LIMIT_MAX, + }; +} + +void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) { + assert(c); + assert(a); + + LIST_REMOVE(device_allow, c->device_allow, a); + free(a->path); + free(a); +} + +void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) { + assert(c); + assert(w); + + LIST_REMOVE(device_weights, c->io_device_weights, w); + free(w->path); + free(w); +} + +void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) { + assert(c); + assert(l); + + LIST_REMOVE(device_latencies, c->io_device_latencies, l); + free(l->path); + free(l); +} + +void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) { + assert(c); + assert(l); + + LIST_REMOVE(device_limits, c->io_device_limits, l); + free(l->path); + free(l); +} + +void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) { + assert(c); + assert(w); + + LIST_REMOVE(device_weights, c->blockio_device_weights, w); + free(w->path); + free(w); +} + +void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) { + assert(c); + assert(b); + + LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b); + free(b->path); + free(b); +} + +void cgroup_context_done(CGroupContext *c) { + assert(c); + + while (c->io_device_weights) + cgroup_context_free_io_device_weight(c, c->io_device_weights); + + while (c->io_device_latencies) + cgroup_context_free_io_device_latency(c, c->io_device_latencies); + + while (c->io_device_limits) + cgroup_context_free_io_device_limit(c, c->io_device_limits); + + while (c->blockio_device_weights) + cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); + + while (c->blockio_device_bandwidths) + cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths); + + while (c->device_allow) + cgroup_context_free_device_allow(c, c->device_allow); + + c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow); + c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny); +} + +void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { + CGroupIODeviceLimit *il; + CGroupIODeviceWeight *iw; + CGroupIODeviceLatency *l; + CGroupBlockIODeviceBandwidth *b; + CGroupBlockIODeviceWeight *w; + CGroupDeviceAllow *a; + IPAddressAccessItem *iaai; + char u[FORMAT_TIMESPAN_MAX]; + + assert(c); + assert(f); + + prefix = strempty(prefix); + + fprintf(f, + "%sCPUAccounting=%s\n" + "%sIOAccounting=%s\n" + "%sBlockIOAccounting=%s\n" + "%sMemoryAccounting=%s\n" + "%sTasksAccounting=%s\n" + "%sIPAccounting=%s\n" + "%sCPUWeight=%" PRIu64 "\n" + "%sStartupCPUWeight=%" PRIu64 "\n" + "%sCPUShares=%" PRIu64 "\n" + "%sStartupCPUShares=%" PRIu64 "\n" + "%sCPUQuotaPerSecSec=%s\n" + "%sIOWeight=%" PRIu64 "\n" + "%sStartupIOWeight=%" PRIu64 "\n" + "%sBlockIOWeight=%" PRIu64 "\n" + "%sStartupBlockIOWeight=%" PRIu64 "\n" + "%sMemoryMin=%" PRIu64 "\n" + "%sMemoryLow=%" PRIu64 "\n" + "%sMemoryHigh=%" PRIu64 "\n" + "%sMemoryMax=%" PRIu64 "\n" + "%sMemorySwapMax=%" PRIu64 "\n" + "%sMemoryLimit=%" PRIu64 "\n" + "%sTasksMax=%" PRIu64 "\n" + "%sDevicePolicy=%s\n" + "%sDelegate=%s\n", + prefix, yes_no(c->cpu_accounting), + prefix, yes_no(c->io_accounting), + prefix, yes_no(c->blockio_accounting), + prefix, yes_no(c->memory_accounting), + prefix, yes_no(c->tasks_accounting), + prefix, yes_no(c->ip_accounting), + prefix, c->cpu_weight, + prefix, c->startup_cpu_weight, + prefix, c->cpu_shares, + prefix, c->startup_cpu_shares, + prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1), + prefix, c->io_weight, + prefix, c->startup_io_weight, + prefix, c->blockio_weight, + prefix, c->startup_blockio_weight, + prefix, c->memory_min, + prefix, c->memory_low, + prefix, c->memory_high, + prefix, c->memory_max, + prefix, c->memory_swap_max, + prefix, c->memory_limit, + prefix, c->tasks_max, + prefix, cgroup_device_policy_to_string(c->device_policy), + prefix, yes_no(c->delegate)); + + if (c->delegate) { + _cleanup_free_ char *t = NULL; + + (void) cg_mask_to_string(c->delegate_controllers, &t); + + fprintf(f, "%sDelegateControllers=%s\n", + prefix, + strempty(t)); + } + + LIST_FOREACH(device_allow, a, c->device_allow) + fprintf(f, + "%sDeviceAllow=%s %s%s%s\n", + prefix, + a->path, + a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : ""); + + LIST_FOREACH(device_weights, iw, c->io_device_weights) + fprintf(f, + "%sIODeviceWeight=%s %" PRIu64 "\n", + prefix, + iw->path, + iw->weight); + + LIST_FOREACH(device_latencies, l, c->io_device_latencies) + fprintf(f, + "%sIODeviceLatencyTargetSec=%s %s\n", + prefix, + l->path, + format_timespan(u, sizeof(u), l->target_usec, 1)); + + LIST_FOREACH(device_limits, il, c->io_device_limits) { + char buf[FORMAT_BYTES_MAX]; + CGroupIOLimitType type; + + for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + if (il->limits[type] != cgroup_io_limit_defaults[type]) + fprintf(f, + "%s%s=%s %s\n", + prefix, + cgroup_io_limit_type_to_string(type), + il->path, + format_bytes(buf, sizeof(buf), il->limits[type])); + } + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) + fprintf(f, + "%sBlockIODeviceWeight=%s %" PRIu64, + prefix, + w->path, + w->weight); + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + char buf[FORMAT_BYTES_MAX]; + + if (b->rbps != CGROUP_LIMIT_MAX) + fprintf(f, + "%sBlockIOReadBandwidth=%s %s\n", + prefix, + b->path, + format_bytes(buf, sizeof(buf), b->rbps)); + if (b->wbps != CGROUP_LIMIT_MAX) + fprintf(f, + "%sBlockIOWriteBandwidth=%s %s\n", + prefix, + b->path, + format_bytes(buf, sizeof(buf), b->wbps)); + } + + LIST_FOREACH(items, iaai, c->ip_address_allow) { + _cleanup_free_ char *k = NULL; + + (void) in_addr_to_string(iaai->family, &iaai->address, &k); + fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen); + } + + LIST_FOREACH(items, iaai, c->ip_address_deny) { + _cleanup_free_ char *k = NULL; + + (void) in_addr_to_string(iaai->family, &iaai->address, &k); + fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen); + } +} + +int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) { + _cleanup_free_ CGroupDeviceAllow *a = NULL; + _cleanup_free_ char *d = NULL; + + assert(c); + assert(dev); + assert(isempty(mode) || in_charset(mode, "rwm")); + + a = new(CGroupDeviceAllow, 1); + if (!a) + return -ENOMEM; + + d = strdup(dev); + if (!d) + return -ENOMEM; + + *a = (CGroupDeviceAllow) { + .path = TAKE_PTR(d), + .r = isempty(mode) || strchr(mode, 'r'), + .w = isempty(mode) || strchr(mode, 'w'), + .m = isempty(mode) || strchr(mode, 'm'), + }; + + LIST_PREPEND(device_allow, c->device_allow, a); + TAKE_PTR(a); + + return 0; +} + +static void cgroup_xattr_apply(Unit *u) { + char ids[SD_ID128_STRING_MAX]; + int r; + + assert(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + if (sd_id128_is_null(u->invocation_id)) + return; + + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, + "trusted.invocation_id", + sd_id128_to_string(u->invocation_id, ids), 32, + 0); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path); +} + +static int lookup_block_device(const char *p, dev_t *ret) { + dev_t rdev, dev = 0; + mode_t mode; + int r; + + assert(p); + assert(ret); + + r = device_path_parse_major_minor(p, &mode, &rdev); + if (r == -ENODEV) { /* not a parsable device node, need to go to disk */ + struct stat st; + if (stat(p, &st) < 0) + return log_warning_errno(errno, "Couldn't stat device '%s': %m", p); + rdev = (dev_t)st.st_rdev; + dev = (dev_t)st.st_dev; + mode = st.st_mode; + } else if (r < 0) + return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p); + + if (S_ISCHR(mode)) { + log_warning("Device node '%s' is a character device, but block device needed.", p); + return -ENOTBLK; + } else if (S_ISBLK(mode)) + *ret = rdev; + else if (major(dev) != 0) + *ret = dev; /* If this is not a device node then use the block device this file is stored on */ + else { + /* If this is btrfs, getting the backing block device is a bit harder */ + r = btrfs_get_block_device(p, ret); + if (r < 0 && r != -ENOTTY) + return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p); + if (r == -ENOTTY) { + log_warning("'%s' is not a block device node, and file system block device cannot be determined or is not local.", p); + return -ENODEV; + } + } + + /* If this is a LUKS device, try to get the originating block device */ + (void) block_get_originating(*ret, ret); + + /* If this is a partition, try to get the originating block device */ + (void) block_get_whole_disk(*ret, ret); + return 0; +} + +static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) { + dev_t rdev; + mode_t mode; + int r; + + assert(path); + assert(acc); + + /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and + * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. This + * means clients can use these path without the device node actually around */ + r = device_path_parse_major_minor(node, &mode, &rdev); + if (r < 0) { + if (r != -ENODEV) + return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node); + + struct stat st; + if (stat(node, &st) < 0) + return log_warning_errno(errno, "Couldn't stat device %s: %m", node); + + if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) { + log_warning("%s is not a device.", node); + return -ENODEV; + } + rdev = (dev_t) st.st_rdev; + mode = st.st_mode; + } + + if (cg_all_unified() > 0) { + if (!prog) + return 0; + + return cgroup_bpf_whitelist_device(prog, S_ISCHR(mode) ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, + major(rdev), minor(rdev), acc); + + } else { + char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4]; + + sprintf(buf, + "%c %u:%u %s", + S_ISCHR(mode) ? 'c' : 'b', + major(rdev), minor(rdev), + acc); + + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL here. */ + + r = cg_set_attribute("devices", path, "devices.allow", buf); + if (r < 0) + return log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, + r, "Failed to set devices.allow on %s: %m", path); + + return 0; + } +} + +static int whitelist_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc) { + _cleanup_fclose_ FILE *f = NULL; + char buf[2+DECIMAL_STR_MAX(unsigned)+3+4]; + bool good = false; + unsigned maj; + int r; + + assert(path); + assert(acc); + assert(IN_SET(type, 'b', 'c')); + + if (streq(name, "*")) { + /* If the name is a wildcard, then apply this list to all devices of this type */ + + if (cg_all_unified() > 0) { + if (!prog) + return 0; + + (void) cgroup_bpf_whitelist_class(prog, type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, acc); + } else { + xsprintf(buf, "%c *:* %s", type, acc); + + r = cg_set_attribute("devices", path, "devices.allow", buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to set devices.allow on %s: %m", path); + return 0; + } + } + + if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj)) { + /* The name is numeric and suitable as major. In that case, let's take is major, and create the entry + * directly */ + + if (cg_all_unified() > 0) { + if (!prog) + return 0; + + (void) cgroup_bpf_whitelist_major(prog, + type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, + maj, acc); + } else { + xsprintf(buf, "%c %u:* %s", type, maj, acc); + + r = cg_set_attribute("devices", path, "devices.allow", buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to set devices.allow on %s: %m", path); + } + + return 0; + } + + f = fopen("/proc/devices", "re"); + if (!f) + return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *w, *p; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_warning_errno(r, "Failed to read /proc/devices: %m"); + if (r == 0) + break; + + if (type == 'c' && streq(line, "Character devices:")) { + good = true; + continue; + } + + if (type == 'b' && streq(line, "Block devices:")) { + good = true; + continue; + } + + if (isempty(line)) { + good = false; + continue; + } + + if (!good) + continue; + + p = strstrip(line); + + w = strpbrk(p, WHITESPACE); + if (!w) + continue; + *w = 0; + + r = safe_atou(p, &maj); + if (r < 0) + continue; + if (maj <= 0) + continue; + + w++; + w += strspn(w, WHITESPACE); + + if (fnmatch(name, w, 0) != 0) + continue; + + if (cg_all_unified() > 0) { + if (!prog) + continue; + + (void) cgroup_bpf_whitelist_major(prog, + type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, + maj, acc); + } else { + sprintf(buf, + "%c %u:* %s", + type, + maj, + acc); + + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL + * here. */ + + r = cg_set_attribute("devices", path, "devices.allow", buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, + r, "Failed to set devices.allow on %s: %m", path); + } + } + + return 0; +} + +static bool cgroup_context_has_cpu_weight(CGroupContext *c) { + return c->cpu_weight != CGROUP_WEIGHT_INVALID || + c->startup_cpu_weight != CGROUP_WEIGHT_INVALID; +} + +static bool cgroup_context_has_cpu_shares(CGroupContext *c) { + return c->cpu_shares != CGROUP_CPU_SHARES_INVALID || + c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID; +} + +static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && + c->startup_cpu_weight != CGROUP_WEIGHT_INVALID) + return c->startup_cpu_weight; + else if (c->cpu_weight != CGROUP_WEIGHT_INVALID) + return c->cpu_weight; + else + return CGROUP_WEIGHT_DEFAULT; +} + +static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && + c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID) + return c->startup_cpu_shares; + else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID) + return c->cpu_shares; + else + return CGROUP_CPU_SHARES_DEFAULT; +} + +static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; + + xsprintf(buf, "%" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf); +} + +static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota) { + char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1]; + + if (quota != USEC_INFINITY) + xsprintf(buf, USEC_FMT " " USEC_FMT "\n", + quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC); + else + xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC); + (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf); +} + +static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; + + xsprintf(buf, "%" PRIu64 "\n", shares); + (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf); +} + +static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota) { + char buf[DECIMAL_STR_MAX(usec_t) + 2]; + + xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf); + + if (quota != USEC_INFINITY) { + xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf); + } else + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n"); +} + +static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) { + return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT, + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) { + return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT, + CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX); +} + +static bool cgroup_context_has_io_config(CGroupContext *c) { + return c->io_accounting || + c->io_weight != CGROUP_WEIGHT_INVALID || + c->startup_io_weight != CGROUP_WEIGHT_INVALID || + c->io_device_weights || + c->io_device_latencies || + c->io_device_limits; +} + +static bool cgroup_context_has_blockio_config(CGroupContext *c) { + return c->blockio_accounting || + c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID || + c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID || + c->blockio_device_weights || + c->blockio_device_bandwidths; +} + +static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && + c->startup_io_weight != CGROUP_WEIGHT_INVALID) + return c->startup_io_weight; + else if (c->io_weight != CGROUP_WEIGHT_INVALID) + return c->io_weight; + else + return CGROUP_WEIGHT_DEFAULT; +} + +static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && + c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) + return c->startup_blockio_weight; + else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) + return c->blockio_weight; + else + return CGROUP_BLKIO_WEIGHT_DEFAULT; +} + +static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) { + return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT, + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) { + return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT, + CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX); +} + +static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight); + (void) set_attribute_and_warn(u, "io", "io.weight", buf); +} + +static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight); + (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf); +} + +static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + if (target != USEC_INFINITY) + xsprintf(buf, "%u:%u target=%" PRIu64 "\n", major(dev), minor(dev), target); + else + xsprintf(buf, "%u:%u target=max\n", major(dev), minor(dev)); + + (void) set_attribute_and_warn(u, "io", "io.latency", buf); +} + +static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) { + char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)]; + char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4]; + CGroupIOLimitType type; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + if (limits[type] != cgroup_io_limit_defaults[type]) + xsprintf(limit_bufs[type], "%" PRIu64, limits[type]); + else + xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0"); + + xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev), + limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX], + limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]); + (void) set_attribute_and_warn(u, "io", "io.max", buf); +} + +static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps); + (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf); + + sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps); + (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf); +} + +static bool cgroup_context_has_unified_memory_config(CGroupContext *c) { + return c->memory_min > 0 || c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX; +} + +static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) { + char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n"; + + if (v != CGROUP_LIMIT_MAX) + xsprintf(buf, "%" PRIu64 "\n", v); + + (void) set_attribute_and_warn(u, "memory", file, buf); +} + +static void cgroup_apply_firewall(Unit *u) { + assert(u); + + /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */ + + if (bpf_firewall_compile(u) < 0) + return; + + (void) bpf_firewall_install(u); +} + +static void cgroup_context_apply( + Unit *u, + CGroupMask apply_mask, + ManagerState state) { + + const char *path; + CGroupContext *c; + bool is_host_root, is_local_root; + int r; + + assert(u); + + /* Nothing to do? Exit early! */ + if (apply_mask == 0) + return; + + /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other + * attributes should only be managed for cgroups further down the tree. */ + is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE); + is_host_root = unit_has_host_root_cgroup(u); + + assert_se(c = unit_get_cgroup_context(u)); + assert_se(path = u->cgroup_path); + + if (is_local_root) /* Make sure we don't try to display messages with an empty path. */ + path = "/"; + + /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container + * then), and missing cgroups, i.e. EROFS and ENOENT. */ + + /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but + * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this + * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of + * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used + * we couldn't even write to them if we wanted to). */ + if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) { + + if (cg_all_unified() > 0) { + uint64_t weight; + + if (cgroup_context_has_cpu_weight(c)) + weight = cgroup_context_cpu_weight(c, state); + else if (cgroup_context_has_cpu_shares(c)) { + uint64_t shares; + + shares = cgroup_context_cpu_shares(c, state); + weight = cgroup_cpu_shares_to_weight(shares); + + log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s", + shares, weight, path); + } else + weight = CGROUP_WEIGHT_DEFAULT; + + cgroup_apply_unified_cpu_weight(u, weight); + cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec); + + } else { + uint64_t shares; + + if (cgroup_context_has_cpu_weight(c)) { + uint64_t weight; + + weight = cgroup_context_cpu_weight(c, state); + shares = cgroup_cpu_weight_to_shares(weight); + + log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s", + weight, shares, path); + } else if (cgroup_context_has_cpu_shares(c)) + shares = cgroup_context_cpu_shares(c, state); + else + shares = CGROUP_CPU_SHARES_DEFAULT; + + cgroup_apply_legacy_cpu_shares(u, shares); + cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec); + } + } + + /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2 + * controller), and in case of containers we want to leave control of these attributes to the container manager + * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */ + if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) { + char buf[8+DECIMAL_STR_MAX(uint64_t)+1]; + bool has_io, has_blockio; + uint64_t weight; + + has_io = cgroup_context_has_io_config(c); + has_blockio = cgroup_context_has_blockio_config(c); + + if (has_io) + weight = cgroup_context_io_weight(c, state); + else if (has_blockio) { + uint64_t blkio_weight; + + blkio_weight = cgroup_context_blkio_weight(c, state); + weight = cgroup_weight_blkio_to_io(blkio_weight); + + log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64, + blkio_weight, weight); + } else + weight = CGROUP_WEIGHT_DEFAULT; + + xsprintf(buf, "default %" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "io", "io.weight", buf); + + if (has_io) { + CGroupIODeviceLatency *latency; + CGroupIODeviceLimit *limit; + CGroupIODeviceWeight *w; + + LIST_FOREACH(device_weights, w, c->io_device_weights) + cgroup_apply_io_device_weight(u, w->path, w->weight); + + LIST_FOREACH(device_limits, limit, c->io_device_limits) + cgroup_apply_io_device_limit(u, limit->path, limit->limits); + + LIST_FOREACH(device_latencies, latency, c->io_device_latencies) + cgroup_apply_io_device_latency(u, latency->path, latency->target_usec); + + } else if (has_blockio) { + CGroupBlockIODeviceWeight *w; + CGroupBlockIODeviceBandwidth *b; + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) { + weight = cgroup_weight_blkio_to_io(w->weight); + + log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s", + w->weight, weight, w->path); + + cgroup_apply_io_device_weight(u, w->path, weight); + } + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX]; + CGroupIOLimitType type; + + for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + limits[type] = cgroup_io_limit_defaults[type]; + + limits[CGROUP_IO_RBPS_MAX] = b->rbps; + limits[CGROUP_IO_WBPS_MAX] = b->wbps; + + log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s", + b->rbps, b->wbps, b->path); + + cgroup_apply_io_device_limit(u, b->path, limits); + } + } + } + + if (apply_mask & CGROUP_MASK_BLKIO) { + bool has_io, has_blockio; + + has_io = cgroup_context_has_io_config(c); + has_blockio = cgroup_context_has_blockio_config(c); + + /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be + * left to our container manager, too. */ + if (!is_local_root) { + char buf[DECIMAL_STR_MAX(uint64_t)+1]; + uint64_t weight; + + if (has_io) { + uint64_t io_weight; + + io_weight = cgroup_context_io_weight(c, state); + weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state)); + + log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64, + io_weight, weight); + } else if (has_blockio) + weight = cgroup_context_blkio_weight(c, state); + else + weight = CGROUP_BLKIO_WEIGHT_DEFAULT; + + xsprintf(buf, "%" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf); + + if (has_io) { + CGroupIODeviceWeight *w; + + LIST_FOREACH(device_weights, w, c->io_device_weights) { + weight = cgroup_weight_io_to_blkio(w->weight); + + log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s", + w->weight, weight, w->path); + + cgroup_apply_blkio_device_weight(u, w->path, weight); + } + } else if (has_blockio) { + CGroupBlockIODeviceWeight *w; + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) + cgroup_apply_blkio_device_weight(u, w->path, w->weight); + } + } + + /* The bandwith limits are something that make sense to be applied to the host's root but not container + * roots, as there we want the container manager to handle it */ + if (is_host_root || !is_local_root) { + if (has_io) { + CGroupIODeviceLimit *l; + + LIST_FOREACH(device_limits, l, c->io_device_limits) { + log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s", + l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path); + + cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]); + } + } else if (has_blockio) { + CGroupBlockIODeviceBandwidth *b; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) + cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps); + } + } + } + + /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes' + * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we + * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even + * write to this if we wanted to.) */ + if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) { + + if (cg_all_unified() > 0) { + uint64_t max, swap_max = CGROUP_LIMIT_MAX; + + if (cgroup_context_has_unified_memory_config(c)) { + max = c->memory_max; + swap_max = c->memory_swap_max; + } else { + max = c->memory_limit; + + if (max != CGROUP_LIMIT_MAX) + log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max); + } + + cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min); + cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low); + cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high); + cgroup_apply_unified_memory_limit(u, "memory.max", max); + cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max); + + } else { + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + uint64_t val; + + if (cgroup_context_has_unified_memory_config(c)) { + val = c->memory_max; + log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val); + } else + val = c->memory_limit; + + if (val == CGROUP_LIMIT_MAX) + strncpy(buf, "-1\n", sizeof(buf)); + else + xsprintf(buf, "%" PRIu64 "\n", val); + + (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf); + } + } + + /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of + * containers, where we leave this to the manager */ + if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) && + (is_host_root || cg_all_unified() > 0 || !is_local_root)) { + _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL; + CGroupDeviceAllow *a; + + if (cg_all_unified() > 0) { + r = cgroup_init_device_bpf(&prog, c->device_policy, c->device_allow); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m"); + } else { + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL + * here. */ + + if (c->device_allow || c->device_policy != CGROUP_AUTO) + r = cg_set_attribute("devices", path, "devices.deny", "a"); + else + r = cg_set_attribute("devices", path, "devices.allow", "a"); + if (r < 0) + log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to reset devices.allow/devices.deny: %m"); + } + + if (c->device_policy == CGROUP_CLOSED || + (c->device_policy == CGROUP_AUTO && c->device_allow)) { + static const char auto_devices[] = + "/dev/null\0" "rwm\0" + "/dev/zero\0" "rwm\0" + "/dev/full\0" "rwm\0" + "/dev/random\0" "rwm\0" + "/dev/urandom\0" "rwm\0" + "/dev/tty\0" "rwm\0" + "/dev/ptmx\0" "rwm\0" + /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */ + "/run/systemd/inaccessible/chr\0" "rwm\0" + "/run/systemd/inaccessible/blk\0" "rwm\0"; + + const char *x, *y; + + NULSTR_FOREACH_PAIR(x, y, auto_devices) + (void) whitelist_device(prog, path, x, y); + + /* PTS (/dev/pts) devices may not be duplicated, but accessed */ + (void) whitelist_major(prog, path, "pts", 'c', "rw"); + } + + LIST_FOREACH(device_allow, a, c->device_allow) { + char acc[4], *val; + unsigned k = 0; + + if (a->r) + acc[k++] = 'r'; + if (a->w) + acc[k++] = 'w'; + if (a->m) + acc[k++] = 'm'; + + if (k == 0) + continue; + + acc[k++] = 0; + + if (path_startswith(a->path, "/dev/")) + (void) whitelist_device(prog, path, a->path, acc); + else if ((val = startswith(a->path, "block-"))) + (void) whitelist_major(prog, path, val, 'b', acc); + else if ((val = startswith(a->path, "char-"))) + (void) whitelist_major(prog, path, val, 'c', acc); + else + log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path); + } + + r = cgroup_apply_device_bpf(u, prog, c->device_policy, c->device_allow); + if (r < 0) { + static bool warned = false; + + log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r, + "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n" + "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n" + "(This warning is only shown for the first loaded unit using device ACL.)", u->id); + + warned = true; + } + } + + if (apply_mask & CGROUP_MASK_PIDS) { + + if (is_host_root) { + /* So, the "pids" controller does not expose anything on the root cgroup, in order not to + * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when + * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a + * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take + * exclusive ownership of the sysctls, but we still want to honour things if the user sets + * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit + * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded) + * it also counts. But if the user never set a limit through us (i.e. we are the default of + * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on + * the first time we set a limit. Note that this boolean is flushed out on manager reload, + * which is desirable so that there's an offical way to release control of the sysctl from + * systemd: set the limit to unbounded and reload. */ + + if (c->tasks_max != CGROUP_LIMIT_MAX) { + u->manager->sysctl_pid_max_changed = true; + r = procfs_tasks_set_limit(c->tasks_max); + } else if (u->manager->sysctl_pid_max_changed) + r = procfs_tasks_set_limit(TASKS_MAX); + else + r = 0; + if (r < 0) + log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r, + "Failed to write to tasks limit sysctls: %m"); + } + + /* The attribute itself is not available on the host root cgroup, and in the container case we want to + * leave it for the container manager. */ + if (!is_local_root) { + if (c->tasks_max != CGROUP_LIMIT_MAX) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; + + sprintf(buf, "%" PRIu64 "\n", c->tasks_max); + (void) set_attribute_and_warn(u, "pids", "pids.max", buf); + } else + (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n"); + } + } + + if (apply_mask & CGROUP_MASK_BPF_FIREWALL) + cgroup_apply_firewall(u); +} + +static bool unit_get_needs_bpf_firewall(Unit *u) { + CGroupContext *c; + Unit *p; + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + if (c->ip_accounting || + c->ip_address_allow || + c->ip_address_deny) + return true; + + /* If any parent slice has an IP access list defined, it applies too */ + for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) { + c = unit_get_cgroup_context(p); + if (!c) + return false; + + if (c->ip_address_allow || + c->ip_address_deny) + return true; + } + + return false; +} + +static CGroupMask cgroup_context_get_mask(CGroupContext *c) { + CGroupMask mask = 0; + + /* Figure out which controllers we need, based on the cgroup context object */ + + if (c->cpu_accounting) + mask |= get_cpu_accounting_mask(); + + if (cgroup_context_has_cpu_weight(c) || + cgroup_context_has_cpu_shares(c) || + c->cpu_quota_per_sec_usec != USEC_INFINITY) + mask |= CGROUP_MASK_CPU; + + if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c)) + mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO; + + if (c->memory_accounting || + c->memory_limit != CGROUP_LIMIT_MAX || + cgroup_context_has_unified_memory_config(c)) + mask |= CGROUP_MASK_MEMORY; + + if (c->device_allow || + c->device_policy != CGROUP_AUTO) + mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES; + + if (c->tasks_accounting || + c->tasks_max != CGROUP_LIMIT_MAX) + mask |= CGROUP_MASK_PIDS; + + return CGROUP_MASK_EXTEND_JOINED(mask); +} + +static CGroupMask unit_get_bpf_mask(Unit *u) { + CGroupMask mask = 0; + + /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children + * too. */ + + if (unit_get_needs_bpf_firewall(u)) + mask |= CGROUP_MASK_BPF_FIREWALL; + + return mask; +} + +CGroupMask unit_get_own_mask(Unit *u) { + CGroupContext *c; + + /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty + * mask, as we shouldn't reflect it in the cgroup hierarchy then. */ + + if (u->load_state != UNIT_LOADED) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + return (cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u); +} + +CGroupMask unit_get_delegate_mask(Unit *u) { + CGroupContext *c; + + /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the + * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers. + * + * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */ + + if (!unit_cgroup_delegate(u)) + return 0; + + if (cg_all_unified() <= 0) { + ExecContext *e; + + e = unit_get_exec_context(u); + if (e && !exec_context_maintains_privileges(e)) + return 0; + } + + assert_se(c = unit_get_cgroup_context(u)); + return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers); +} + +CGroupMask unit_get_members_mask(Unit *u) { + assert(u); + + /* Returns the mask of controllers all of the unit's children require, merged */ + + if (u->cgroup_members_mask_valid) + return u->cgroup_members_mask; /* Use cached value if possible */ + + u->cgroup_members_mask = 0; + + if (u->type == UNIT_SLICE) { + void *v; + Unit *member; + Iterator i; + + HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) { + if (UNIT_DEREF(member->slice) == u) + u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */ + } + } + + u->cgroup_members_mask_valid = true; + return u->cgroup_members_mask; +} + +CGroupMask unit_get_siblings_mask(Unit *u) { + assert(u); + + /* Returns the mask of controllers all of the unit's siblings + * require, i.e. the members mask of the unit's parent slice + * if there is one. */ + + if (UNIT_ISSET(u->slice)) + return unit_get_members_mask(UNIT_DEREF(u->slice)); + + return unit_get_subtree_mask(u); /* we are the top-level slice */ +} + +CGroupMask unit_get_disable_mask(Unit *u) { + CGroupContext *c; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + return c->disable_controllers; +} + +CGroupMask unit_get_ancestor_disable_mask(Unit *u) { + CGroupMask mask; + + assert(u); + mask = unit_get_disable_mask(u); + + /* Returns the mask of controllers which are marked as forcibly + * disabled in any ancestor unit or the unit in question. */ + + if (UNIT_ISSET(u->slice)) + mask |= unit_get_ancestor_disable_mask(UNIT_DEREF(u->slice)); + + return mask; +} + +CGroupMask unit_get_subtree_mask(Unit *u) { + + /* Returns the mask of this subtree, meaning of the group + * itself and its children. */ + + return unit_get_own_mask(u) | unit_get_members_mask(u); +} + +CGroupMask unit_get_target_mask(Unit *u) { + CGroupMask mask; + + /* This returns the cgroup mask of all controllers to enable + * for a specific cgroup, i.e. everything it needs itself, + * plus all that its children need, plus all that its siblings + * need. This is primarily useful on the legacy cgroup + * hierarchy, where we need to duplicate each cgroup in each + * hierarchy that shall be enabled for it. */ + + mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u); + mask &= u->manager->cgroup_supported; + mask &= ~unit_get_ancestor_disable_mask(u); + + return mask; +} + +CGroupMask unit_get_enable_mask(Unit *u) { + CGroupMask mask; + + /* This returns the cgroup mask of all controllers to enable + * for the children of a specific cgroup. This is primarily + * useful for the unified cgroup hierarchy, where each cgroup + * controls which controllers are enabled for its children. */ + + mask = unit_get_members_mask(u); + mask &= u->manager->cgroup_supported; + mask &= ~unit_get_ancestor_disable_mask(u); + + return mask; +} + +void unit_invalidate_cgroup_members_masks(Unit *u) { + assert(u); + + /* Recurse invalidate the member masks cache all the way up the tree */ + u->cgroup_members_mask_valid = false; + + if (UNIT_ISSET(u->slice)) + unit_invalidate_cgroup_members_masks(UNIT_DEREF(u->slice)); +} + +const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) { + + /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */ + + while (u) { + + if (u->cgroup_path && + u->cgroup_realized && + FLAGS_SET(u->cgroup_realized_mask, mask)) + return u->cgroup_path; + + u = UNIT_DEREF(u->slice); + } + + return NULL; +} + +static const char *migrate_callback(CGroupMask mask, void *userdata) { + return unit_get_realized_cgroup_path(userdata, mask); +} + +char *unit_default_cgroup_path(const Unit *u) { + _cleanup_free_ char *escaped = NULL, *slice = NULL; + int r; + + assert(u); + + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return strdup(u->manager->cgroup_root); + + if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) { + r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice); + if (r < 0) + return NULL; + } + + escaped = cg_escape(u->id); + if (!escaped) + return NULL; + + if (slice) + return strjoin(u->manager->cgroup_root, "/", slice, "/", + escaped); + else + return strjoin(u->manager->cgroup_root, "/", escaped); +} + +int unit_set_cgroup_path(Unit *u, const char *path) { + _cleanup_free_ char *p = NULL; + int r; + + assert(u); + + if (path) { + p = strdup(path); + if (!p) + return -ENOMEM; + } else + p = NULL; + + if (streq_ptr(u->cgroup_path, p)) + return 0; + + if (p) { + r = hashmap_put(u->manager->cgroup_unit, p, u); + if (r < 0) + return r; + } + + unit_release_cgroup(u); + + u->cgroup_path = TAKE_PTR(p); + + return 1; +} + +int unit_watch_cgroup(Unit *u) { + _cleanup_free_ char *events = NULL; + int r; + + assert(u); + + if (!u->cgroup_path) + return 0; + + if (u->cgroup_inotify_wd >= 0) + return 0; + + /* Only applies to the unified hierarchy */ + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m"); + if (r == 0) + return 0; + + /* Don't watch the root slice, it's pointless. */ + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops); + if (r < 0) + return log_oom(); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events); + if (r < 0) + return log_oom(); + + u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); + if (u->cgroup_inotify_wd < 0) { + + if (errno == ENOENT) /* If the directory is already + * gone we don't need to track + * it, so this is not an error */ + return 0; + + return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path); + } + + r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m"); + + return 0; +} + +int unit_pick_cgroup_path(Unit *u) { + _cleanup_free_ char *path = NULL; + int r; + + assert(u); + + if (u->cgroup_path) + return 0; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + path = unit_default_cgroup_path(u); + if (!path) + return log_oom(); + + r = unit_set_cgroup_path(u, path); + if (r == -EEXIST) + return log_unit_error_errno(u, r, "Control group %s exists already.", path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path); + + return 0; +} + +static int unit_create_cgroup( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask, + ManagerState state) { + + bool created; + int r; + + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + /* Figure out our cgroup path */ + r = unit_pick_cgroup_path(u); + if (r < 0) + return r; + + /* First, create our own group */ + r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path); + created = r; + + /* Start watching it */ + (void) unit_watch_cgroup(u); + + /* Preserve enabled controllers in delegated units, adjust others. */ + if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) { + CGroupMask result_mask = 0; + + /* Enable all controllers we need */ + r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", u->cgroup_path); + + /* If we just turned off a controller, this might release the controller for our parent too, let's + * enqueue the parent for re-realization in that case again. */ + if (UNIT_ISSET(u->slice)) { + CGroupMask turned_off; + + turned_off = (u->cgroup_realized ? u->cgroup_enabled_mask & ~result_mask : 0); + if (turned_off != 0) { + Unit *parent; + + /* Force the parent to propagate the enable mask to the kernel again, by invalidating + * the controller we just turned off. */ + + for (parent = UNIT_DEREF(u->slice); parent; parent = UNIT_DEREF(parent->slice)) + unit_invalidate_cgroup(parent, turned_off); + } + } + + /* Remember what's actually enabled now */ + u->cgroup_enabled_mask = result_mask; + } + + /* Keep track that this is now realized */ + u->cgroup_realized = true; + u->cgroup_realized_mask = target_mask; + + if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) { + + /* Then, possibly move things over, but not if + * subgroups may contain processes, which is the case + * for slice and delegation units. */ + r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path); + } + + /* Set attributes */ + cgroup_context_apply(u, target_mask, state); + cgroup_xattr_apply(u); + + return 0; +} + +static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + char *pp; + int r; + + assert(u); + + if (MANAGER_IS_SYSTEM(u->manager)) + return -EINVAL; + + if (!u->manager->system_bus) + return -EIO; + + if (!u->cgroup_path) + return -EINVAL; + + /* Determine this unit's cgroup path relative to our cgroup root */ + pp = path_startswith(u->cgroup_path, u->manager->cgroup_root); + if (!pp) + return -EINVAL; + + pp = strjoina("/", pp, suffix_path); + path_simplify(pp, false); + + r = sd_bus_call_method(u->manager->system_bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "AttachProcessesToUnit", + &error, NULL, + "ssau", + NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r)); + + return 0; +} + +int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) { + CGroupMask delegated_mask; + const char *p; + Iterator i; + void *pidp; + int r, q; + + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + if (set_isempty(pids)) + return 0; + + r = unit_realize_cgroup(u); + if (r < 0) + return r; + + if (isempty(suffix_path)) + p = u->cgroup_path; + else + p = strjoina(u->cgroup_path, "/", suffix_path); + + delegated_mask = unit_get_delegate_mask(u); + + r = 0; + SET_FOREACH(pidp, pids, i) { + pid_t pid = PTR_TO_PID(pidp); + CGroupController c; + + /* First, attach the PID to the main cgroup hierarchy */ + q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid); + if (q < 0) { + log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p); + + if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) { + int z; + + /* If we are in a user instance, and we can't move the process ourselves due to + * permission problems, let's ask the system instance about it instead. Since it's more + * privileged it might be able to move the process across the leaves of a subtree who's + * top node is not owned by us. */ + + z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path); + if (z < 0) + log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p); + else + continue; /* When the bus thing worked via the bus we are fully done for this PID. */ + } + + if (r >= 0) + r = q; /* Remember first error */ + + continue; + } + + q = cg_all_unified(); + if (q < 0) + return q; + if (q > 0) + continue; + + /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the + * innermost realized one */ + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *realized; + + if (!(u->manager->cgroup_supported & bit)) + continue; + + /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */ + if (delegated_mask & u->cgroup_realized_mask & bit) { + q = cg_attach(cgroup_controller_to_string(c), p, pid); + if (q >= 0) + continue; /* Success! */ + + log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m", + pid, p, cgroup_controller_to_string(c)); + } + + /* So this controller is either not delegate or realized, or something else weird happened. In + * that case let's attach the PID at least to the closest cgroup up the tree that is + * realized. */ + realized = unit_get_realized_cgroup_path(u, bit); + if (!realized) + continue; /* Not even realized in the root slice? Then let's not bother */ + + q = cg_attach(cgroup_controller_to_string(c), realized, pid); + if (q < 0) + log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m", + pid, realized, cgroup_controller_to_string(c)); + } + } + + return r; +} + +static bool unit_has_mask_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + + assert(u); + + /* Returns true if this unit is fully realized. We check four things: + * + * 1. Whether the cgroup was created at all + * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1) + * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2) + * 4. Whether the invalidation mask is currently zero + * + * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note + * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for + * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask + * is only matters for cgroup v1 controllers, and cgroup_enabled_mask only used for cgroup v2, and if they + * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks with controllers are + * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they + * simply don't matter. */ + + return u->cgroup_realized && + ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 && + ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 && + u->cgroup_invalidated_mask == 0; +} + +static bool unit_has_mask_disables_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + + assert(u); + + /* Returns true if all controllers which should be disabled are indeed disabled. + * + * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is + * already removed. */ + + return !u->cgroup_realized || + (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) && + FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2)); +} + +static bool unit_has_mask_enables_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + + assert(u); + + /* Returns true if all controllers which should be enabled are indeed enabled. + * + * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything + * we want to add is already added. */ + + return u->cgroup_realized && + ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) && + ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2); +} + +void unit_add_to_cgroup_realize_queue(Unit *u) { + assert(u); + + if (u->in_cgroup_realize_queue) + return; + + LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u); + u->in_cgroup_realize_queue = true; +} + +static void unit_remove_from_cgroup_realize_queue(Unit *u) { + assert(u); + + if (!u->in_cgroup_realize_queue) + return; + + LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u); + u->in_cgroup_realize_queue = false; +} + +/* Controllers can only be enabled breadth-first, from the root of the + * hierarchy downwards to the unit in question. */ +static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) { + CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask; + int r; + + assert(u); + + /* First go deal with this unit's parent, or we won't be able to enable + * any new controllers at this layer. */ + if (UNIT_ISSET(u->slice)) { + r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state); + if (r < 0) + return r; + } + + target_mask = unit_get_target_mask(u); + enable_mask = unit_get_enable_mask(u); + + /* We can only enable in this direction, don't try to disable anything. + */ + if (unit_has_mask_enables_realized(u, target_mask, enable_mask)) + return 0; + + new_target_mask = u->cgroup_realized_mask | target_mask; + new_enable_mask = u->cgroup_enabled_mask | enable_mask; + + return unit_create_cgroup(u, new_target_mask, new_enable_mask, state); +} + +/* Controllers can only be disabled depth-first, from the leaves of the + * hierarchy upwards to the unit in question. */ +static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) { + Iterator i; + Unit *m; + void *v; + + assert(u); + + if (u->type != UNIT_SLICE) + return 0; + + HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) { + CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask; + int r; + + if (UNIT_DEREF(m->slice) != u) + continue; + + /* The cgroup for this unit might not actually be fully + * realised yet, in which case it isn't holding any controllers + * open anyway. */ + if (!m->cgroup_path) + continue; + + /* We must disable those below us first in order to release the + * controller. */ + if (m->type == UNIT_SLICE) + (void) unit_realize_cgroup_now_disable(m, state); + + target_mask = unit_get_target_mask(m); + enable_mask = unit_get_enable_mask(m); + + /* We can only disable in this direction, don't try to enable + * anything. */ + if (unit_has_mask_disables_realized(m, target_mask, enable_mask)) + continue; + + new_target_mask = m->cgroup_realized_mask & target_mask; + new_enable_mask = m->cgroup_enabled_mask & enable_mask; + + r = unit_create_cgroup(m, new_target_mask, new_enable_mask, state); + if (r < 0) + return r; + } + + return 0; +} + +/* Check if necessary controllers and attributes for a unit are in place. + * + * - If so, do nothing. + * - If not, create paths, move processes over, and set attributes. + * + * Controllers can only be *enabled* in a breadth-first way, and *disabled* in + * a depth-first way. As such the process looks like this: + * + * Suppose we have a cgroup hierarchy which looks like this: + * + * root + * / \ + * / \ + * / \ + * a b + * / \ / \ + * / \ / \ + * c d e f + * / \ / \ / \ / \ + * h i j k l m n o + * + * 1. We want to realise cgroup "d" now. + * 2. cgroup "a" has DisableControllers=cpu in the associated unit. + * 3. cgroup "k" just started requesting the memory controller. + * + * To make this work we must do the following in order: + * + * 1. Disable CPU controller in k, j + * 2. Disable CPU controller in d + * 3. Enable memory controller in root + * 4. Enable memory controller in a + * 5. Enable memory controller in d + * 6. Enable memory controller in k + * + * Notice that we need to touch j in one direction, but not the other. We also + * don't go beyond d when disabling -- it's up to "a" to get realized if it + * wants to disable further. The basic rules are therefore: + * + * - If you're disabling something, you need to realise all of the cgroups from + * your recursive descendants to the root. This starts from the leaves. + * - If you're enabling something, you need to realise from the root cgroup + * downwards, but you don't need to iterate your recursive descendants. + * + * Returns 0 on success and < 0 on failure. */ +static int unit_realize_cgroup_now(Unit *u, ManagerState state) { + CGroupMask target_mask, enable_mask; + int r; + + assert(u); + + unit_remove_from_cgroup_realize_queue(u); + + target_mask = unit_get_target_mask(u); + enable_mask = unit_get_enable_mask(u); + + if (unit_has_mask_realized(u, target_mask, enable_mask)) + return 0; + + /* Disable controllers below us, if there are any */ + r = unit_realize_cgroup_now_disable(u, state); + if (r < 0) + return r; + + /* Enable controllers above us, if there are any */ + if (UNIT_ISSET(u->slice)) { + r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state); + if (r < 0) + return r; + } + + /* Now actually deal with the cgroup we were trying to realise and set attributes */ + r = unit_create_cgroup(u, target_mask, enable_mask, state); + if (r < 0) + return r; + + /* Now, reset the invalidation mask */ + u->cgroup_invalidated_mask = 0; + return 0; +} + +unsigned manager_dispatch_cgroup_realize_queue(Manager *m) { + ManagerState state; + unsigned n = 0; + Unit *i; + int r; + + assert(m); + + state = manager_state(m); + + while ((i = m->cgroup_realize_queue)) { + assert(i->in_cgroup_realize_queue); + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) { + /* Maybe things changed, and the unit is not actually active anymore? */ + unit_remove_from_cgroup_realize_queue(i); + continue; + } + + r = unit_realize_cgroup_now(i, state); + if (r < 0) + log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id); + + n++; + } + + return n; +} + +static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) { + Unit *slice; + + /* This adds the siblings of the specified unit and the + * siblings of all parent units to the cgroup queue. (But + * neither the specified unit itself nor the parents.) */ + + while ((slice = UNIT_DEREF(u->slice))) { + Iterator i; + Unit *m; + void *v; + + HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) { + /* Skip units that have a dependency on the slice + * but aren't actually in it. */ + if (UNIT_DEREF(m->slice) != slice) + continue; + + /* No point in doing cgroup application for units + * without active processes. */ + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m))) + continue; + + /* If the unit doesn't need any new controllers + * and has current ones realized, it doesn't need + * any changes. */ + if (unit_has_mask_realized(m, + unit_get_target_mask(m), + unit_get_enable_mask(m))) + continue; + + unit_add_to_cgroup_realize_queue(m); + } + + u = slice; + } +} + +int unit_realize_cgroup(Unit *u) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + /* So, here's the deal: when realizing the cgroups for this + * unit, we need to first create all parents, but there's more + * actually: for the weight-based controllers we also need to + * make sure that all our siblings (i.e. units that are in the + * same slice as we are) have cgroups, too. Otherwise, things + * would become very uneven as each of their processes would + * get as much resources as all our group together. This call + * will synchronously create the parent cgroups, but will + * defer work on the siblings to the next event loop + * iteration. */ + + /* Add all sibling slices to the cgroup queue. */ + unit_add_siblings_to_cgroup_realize_queue(u); + + /* And realize this one now (and apply the values) */ + return unit_realize_cgroup_now(u, manager_state(u->manager)); +} + +void unit_release_cgroup(Unit *u) { + assert(u); + + /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call + * when we close down everything for reexecution, where we really want to leave the cgroup in place. */ + + if (u->cgroup_path) { + (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); + u->cgroup_path = mfree(u->cgroup_path); + } + + if (u->cgroup_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id); + + (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd)); + u->cgroup_inotify_wd = -1; + } +} + +void unit_prune_cgroup(Unit *u) { + int r; + bool is_root_slice; + + assert(u); + + /* Removes the cgroup, if empty and possible, and stops watching it. */ + + if (!u->cgroup_path) + return; + + (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */ + + is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE); + + r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path); + return; + } + + if (is_root_slice) + return; + + unit_release_cgroup(u); + + u->cgroup_realized = false; + u->cgroup_realized_mask = 0; + u->cgroup_enabled_mask = 0; + + u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed); +} + +int unit_search_main_pid(Unit *u, pid_t *ret) { + _cleanup_fclose_ FILE *f = NULL; + pid_t pid = 0, npid, mypid; + int r; + + assert(u); + assert(ret); + + if (!u->cgroup_path) + return -ENXIO; + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f); + if (r < 0) + return r; + + mypid = getpid_cached(); + while (cg_read_pid(f, &npid) > 0) { + pid_t ppid; + + if (npid == pid) + continue; + + /* Ignore processes that aren't our kids */ + if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid) + continue; + + if (pid != 0) + /* Dang, there's more than one daemonized PID + in this group, so we don't know what process + is the main process. */ + + return -ENODATA; + + pid = npid; + } + + *ret = pid; + return 0; +} + +static int unit_watch_pids_in_path(Unit *u, const char *path) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_fclose_ FILE *f = NULL; + int ret = 0, r; + + assert(u); + assert(path); + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f); + if (r < 0) + ret = r; + else { + pid_t pid; + + while ((r = cg_read_pid(f, &pid)) > 0) { + r = unit_watch_pid(u, pid); + if (r < 0 && ret >= 0) + ret = r; + } + + if (r < 0 && ret >= 0) + ret = r; + } + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); + if (r < 0) { + if (ret >= 0) + ret = r; + } else { + char *fn; + + while ((r = cg_read_subgroup(d, &fn)) > 0) { + _cleanup_free_ char *p = NULL; + + p = strjoin(path, "/", fn); + free(fn); + + if (!p) + return -ENOMEM; + + r = unit_watch_pids_in_path(u, p); + if (r < 0 && ret >= 0) + ret = r; + } + + if (r < 0 && ret >= 0) + ret = r; + } + + return ret; +} + +int unit_synthesize_cgroup_empty_event(Unit *u) { + int r; + + assert(u); + + /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility + * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can + * get as notification source as soon as we stopped having any useful PIDs to watch for. */ + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we have reliable notifications, and don't need this */ + return 0; + + if (!set_isempty(u->pids)) + return 0; + + unit_add_to_cgroup_empty_queue(u); + return 0; +} + +int unit_watch_all_pids(Unit *u) { + int r; + + assert(u); + + /* Adds all PIDs from our cgroup to the set of PIDs we + * watch. This is a fallback logic for cases where we do not + * get reliable cgroup empty notifications: we try to use + * SIGCHLD as replacement. */ + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we can use proper notifications */ + return 0; + + return unit_watch_pids_in_path(u, u->cgroup_path); +} + +static int on_cgroup_empty_event(sd_event_source *s, void *userdata) { + Manager *m = userdata; + Unit *u; + int r; + + assert(s); + assert(m); + + u = m->cgroup_empty_queue; + if (!u) + return 0; + + assert(u->in_cgroup_empty_queue); + u->in_cgroup_empty_queue = false; + LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u); + + if (m->cgroup_empty_queue) { + /* More stuff queued, let's make sure we remain enabled */ + r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT); + if (r < 0) + log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m"); + } + + unit_add_to_gc_queue(u); + + if (UNIT_VTABLE(u)->notify_cgroup_empty) + UNIT_VTABLE(u)->notify_cgroup_empty(u); + + return 0; +} + +void unit_add_to_cgroup_empty_queue(Unit *u) { + int r; + + assert(u); + + /* Note that there are four different ways how cgroup empty events reach us: + * + * 1. On the unified hierarchy we get an inotify event on the cgroup + * + * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket + * + * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus + * + * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as + * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications. + * + * Regardless which way we got the notification, we'll verify it here, and then add it to a separate + * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use + * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending + * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the + * case for scope units). */ + + if (u->in_cgroup_empty_queue) + return; + + /* Let's verify that the cgroup is really empty */ + if (!u->cgroup_path) + return; + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path); + return; + } + if (r == 0) + return; + + LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u); + u->in_cgroup_empty_queue = true; + + /* Trigger the defer event */ + r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT); + if (r < 0) + log_debug_errno(r, "Failed to enable cgroup empty event source: %m"); +} + +static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + + assert(s); + assert(fd >= 0); + assert(m); + + for (;;) { + union inotify_event_buffer buffer; + struct inotify_event *e; + ssize_t l; + + l = read(fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (IN_SET(errno, EINTR, EAGAIN)) + return 0; + + return log_error_errno(errno, "Failed to read control group inotify events: %m"); + } + + FOREACH_INOTIFY_EVENT(e, buffer, l) { + Unit *u; + + if (e->wd < 0) + /* Queue overflow has no watch descriptor */ + continue; + + if (e->mask & IN_IGNORED) + /* The watch was just removed */ + continue; + + u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd)); + if (!u) /* Not that inotify might deliver + * events for a watch even after it + * was removed, because it was queued + * before the removal. Let's ignore + * this here safely. */ + continue; + + unit_add_to_cgroup_empty_queue(u); + } + } +} + +static int cg_bpf_mask_supported(CGroupMask *ret) { + CGroupMask mask = 0; + int r; + + /* BPF-based firewall */ + r = bpf_firewall_supported(); + if (r > 0) + mask |= CGROUP_MASK_BPF_FIREWALL; + + /* BPF-based device access control */ + r = bpf_devices_supported(); + if (r > 0) + mask |= CGROUP_MASK_BPF_DEVICES; + + *ret = mask; + return 0; +} + +int manager_setup_cgroup(Manager *m) { + _cleanup_free_ char *path = NULL; + const char *scope_path; + CGroupController c; + int r, all_unified; + CGroupMask mask; + char *e; + + assert(m); + + /* 1. Determine hierarchy */ + m->cgroup_root = mfree(m->cgroup_root); + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root); + if (r < 0) + return log_error_errno(r, "Cannot determine cgroup we are running in: %m"); + + /* Chop off the init scope, if we are already located in it */ + e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); + + /* LEGACY: Also chop off the system slice if we are in + * it. This is to support live upgrades from older systemd + * versions where PID 1 was moved there. Also see + * cg_get_root_path(). */ + if (!e && MANAGER_IS_SYSTEM(m)) { + e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE); + if (!e) + e = endswith(m->cgroup_root, "/system"); /* even more legacy */ + } + if (e) + *e = 0; + + /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can + * easily prepend it everywhere. */ + delete_trailing_chars(m->cgroup_root, "/"); + + /* 2. Show data */ + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path); + if (r < 0) + return log_error_errno(r, "Cannot find cgroup mount point: %m"); + + r = cg_unified_flush(); + if (r < 0) + return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m"); + + all_unified = cg_all_unified(); + if (all_unified < 0) + return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m"); + if (all_unified > 0) + log_debug("Unified cgroup hierarchy is located at %s.", path); + else { + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m"); + if (r > 0) + log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path); + else + log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path); + } + + /* 3. Allocate cgroup empty defer event source */ + m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source); + r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m); + if (r < 0) + return log_error_errno(r, "Failed to create cgroup empty event source: %m"); + + r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5); + if (r < 0) + return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m"); + + r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF); + if (r < 0) + return log_error_errno(r, "Failed to disable cgroup empty event source: %m"); + + (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty"); + + /* 4. Install notifier inotify object, or agent */ + if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) { + + /* In the unified hierarchy we can get cgroup empty notifications via inotify. */ + + m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); + safe_close(m->cgroup_inotify_fd); + + m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (m->cgroup_inotify_fd < 0) + return log_error_errno(errno, "Failed to create control group inotify object: %m"); + + r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m); + if (r < 0) + return log_error_errno(r, "Failed to watch control group inotify object: %m"); + + /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also + * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */ + r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4); + if (r < 0) + return log_error_errno(r, "Failed to set priority of inotify event source: %m"); + + (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify"); + + } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) { + + /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable, + * since it does not generate events when control groups with children run empty. */ + + r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH); + if (r < 0) + log_warning_errno(r, "Failed to install release agent, ignoring: %m"); + else if (r > 0) + log_debug("Installed release agent."); + else if (r == 0) + log_debug("Release agent already installed."); + } + + /* 5. Make sure we are in the special "init.scope" unit in the root slice. */ + scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + if (r >= 0) { + /* Also, move all other userspace processes remaining in the root cgroup into that scope. */ + r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + if (r < 0) + log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m"); + + /* 6. And pin it, so that it cannot be unmounted */ + safe_close(m->pin_cgroupfs_fd); + m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK); + if (m->pin_cgroupfs_fd < 0) + return log_error_errno(errno, "Failed to open pin file: %m"); + + } else if (!MANAGER_IS_TEST_RUN(m)) + return log_error_errno(r, "Failed to create %s control group: %m", scope_path); + + /* 7. Always enable hierarchical support if it exists... */ + if (!all_unified && !MANAGER_IS_TEST_RUN(m)) + (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1"); + + /* 8. Figure out which controllers are supported */ + r = cg_mask_supported(&m->cgroup_supported); + if (r < 0) + return log_error_errno(r, "Failed to determine supported controllers: %m"); + + /* 9. Figure out which bpf-based pseudo-controllers are supported */ + r = cg_bpf_mask_supported(&mask); + if (r < 0) + return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m"); + m->cgroup_supported |= mask; + + /* 10. Log which controllers are supported */ + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) + log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c))); + + return 0; +} + +void manager_shutdown_cgroup(Manager *m, bool delete) { + assert(m); + + /* We can't really delete the group, since we are in it. But + * let's trim it. */ + if (delete && m->cgroup_root && m->test_run_flags != MANAGER_TEST_RUN_MINIMAL) + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false); + + m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source); + + m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit); + + m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); + m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd); + + m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd); + + m->cgroup_root = mfree(m->cgroup_root); +} + +Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) { + char *p; + Unit *u; + + assert(m); + assert(cgroup); + + u = hashmap_get(m->cgroup_unit, cgroup); + if (u) + return u; + + p = strdupa(cgroup); + for (;;) { + char *e; + + e = strrchr(p, '/'); + if (!e || e == p) + return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE); + + *e = 0; + + u = hashmap_get(m->cgroup_unit, p); + if (u) + return u; + } +} + +Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) { + _cleanup_free_ char *cgroup = NULL; + + assert(m); + + if (!pid_is_valid(pid)) + return NULL; + + if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0) + return NULL; + + return manager_get_unit_by_cgroup(m, cgroup); +} + +Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) { + Unit *u, **array; + + assert(m); + + /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most + * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most + * relevant one as children of the process will be assigned to that one, too, before all else. */ + + if (!pid_is_valid(pid)) + return NULL; + + if (pid == getpid_cached()) + return hashmap_get(m->units, SPECIAL_INIT_SCOPE); + + u = manager_get_unit_by_pid_cgroup(m, pid); + if (u) + return u; + + u = hashmap_get(m->watch_pids, PID_TO_PTR(pid)); + if (u) + return u; + + array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid)); + if (array) + return array[0]; + + return NULL; +} + +int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { + Unit *u; + + assert(m); + assert(cgroup); + + /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process + * or from the --system instance */ + + log_debug("Got cgroup empty notification for: %s", cgroup); + + u = manager_get_unit_by_cgroup(m, cgroup); + if (!u) + return 0; + + unit_add_to_cgroup_empty_queue(u); + return 1; +} + +int unit_get_memory_current(Unit *u, uint64_t *ret) { + _cleanup_free_ char *v = NULL; + int r; + + assert(u); + assert(ret); + + if (!UNIT_CGROUP_BOOL(u, memory_accounting)) + return -ENODATA; + + if (!u->cgroup_path) + return -ENODATA; + + /* The root cgroup doesn't expose this information, let's get it from /proc instead */ + if (unit_has_host_root_cgroup(u)) + return procfs_memory_get_used(ret); + + if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v); + else + r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + + return safe_atou64(v, ret); +} + +int unit_get_tasks_current(Unit *u, uint64_t *ret) { + _cleanup_free_ char *v = NULL; + int r; + + assert(u); + assert(ret); + + if (!UNIT_CGROUP_BOOL(u, tasks_accounting)) + return -ENODATA; + + if (!u->cgroup_path) + return -ENODATA; + + /* The root cgroup doesn't expose this information, let's get it from /proc instead */ + if (unit_has_host_root_cgroup(u)) + return procfs_tasks_get_current(ret); + + if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0) + return -ENODATA; + + r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + + return safe_atou64(v, ret); +} + +static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { + _cleanup_free_ char *v = NULL; + uint64_t ns; + int r; + + assert(u); + assert(ret); + + if (!u->cgroup_path) + return -ENODATA; + + /* The root cgroup doesn't expose this information, let's get it from /proc instead */ + if (unit_has_host_root_cgroup(u)) + return procfs_cpu_get_usage(ret); + + /* Requisite controllers for CPU accounting are not enabled */ + if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) { + _cleanup_free_ char *val = NULL; + uint64_t us; + + r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val); + if (IN_SET(r, -ENOENT, -ENXIO)) + return -ENODATA; + if (r < 0) + return r; + + r = safe_atou64(val, &us); + if (r < 0) + return r; + + ns = us * NSEC_PER_USEC; + } else { + r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + + r = safe_atou64(v, &ns); + if (r < 0) + return r; + } + + *ret = ns; + return 0; +} + +int unit_get_cpu_usage(Unit *u, nsec_t *ret) { + nsec_t ns; + int r; + + assert(u); + + /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was + * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply + * call this function with a NULL return value. */ + + if (!UNIT_CGROUP_BOOL(u, cpu_accounting)) + return -ENODATA; + + r = unit_get_cpu_usage_raw(u, &ns); + if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) { + /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our + * cached value. */ + + if (ret) + *ret = u->cpu_usage_last; + return 0; + } + if (r < 0) + return r; + + if (ns > u->cpu_usage_base) + ns -= u->cpu_usage_base; + else + ns = 0; + + u->cpu_usage_last = ns; + if (ret) + *ret = ns; + + return 0; +} + +int unit_get_ip_accounting( + Unit *u, + CGroupIPAccountingMetric metric, + uint64_t *ret) { + + uint64_t value; + int fd, r; + + assert(u); + assert(metric >= 0); + assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX); + assert(ret); + + if (!UNIT_CGROUP_BOOL(u, ip_accounting)) + return -ENODATA; + + fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ? + u->ip_accounting_ingress_map_fd : + u->ip_accounting_egress_map_fd; + if (fd < 0) + return -ENODATA; + + if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES)) + r = bpf_firewall_read_accounting(fd, &value, NULL); + else + r = bpf_firewall_read_accounting(fd, NULL, &value); + if (r < 0) + return r; + + /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile + * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the + * ip_accounting_extra[] field, and add them in here transparently. */ + + *ret = value + u->ip_accounting_extra[metric]; + + return r; +} + +int unit_reset_cpu_accounting(Unit *u) { + nsec_t ns; + int r; + + assert(u); + + u->cpu_usage_last = NSEC_INFINITY; + + r = unit_get_cpu_usage_raw(u, &ns); + if (r < 0) { + u->cpu_usage_base = 0; + return r; + } + + u->cpu_usage_base = ns; + return 0; +} + +int unit_reset_ip_accounting(Unit *u) { + int r = 0, q = 0; + + assert(u); + + if (u->ip_accounting_ingress_map_fd >= 0) + r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd); + + if (u->ip_accounting_egress_map_fd >= 0) + q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd); + + zero(u->ip_accounting_extra); + + return r < 0 ? r : q; +} + +void unit_invalidate_cgroup(Unit *u, CGroupMask m) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + + if (m == 0) + return; + + /* always invalidate compat pairs together */ + if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO)) + m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO; + + if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT)) + m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT; + + if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */ + return; + + u->cgroup_invalidated_mask |= m; + unit_add_to_cgroup_realize_queue(u); +} + +void unit_invalidate_cgroup_bpf(Unit *u) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + + if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */ + return; + + u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL; + unit_add_to_cgroup_realize_queue(u); + + /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access + * list of our children includes our own. */ + if (u->type == UNIT_SLICE) { + Unit *member; + Iterator i; + void *v; + + HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) { + if (UNIT_DEREF(member->slice) == u) + unit_invalidate_cgroup_bpf(member); + } + } +} + +bool unit_cgroup_delegate(Unit *u) { + CGroupContext *c; + + assert(u); + + if (!UNIT_VTABLE(u)->can_delegate) + return false; + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + return c->delegate; +} + +void manager_invalidate_startup_units(Manager *m) { + Iterator i; + Unit *u; + + assert(m); + + SET_FOREACH(u, m->startup_units, i) + unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO); +} + +static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = { + [CGROUP_AUTO] = "auto", + [CGROUP_CLOSED] = "closed", + [CGROUP_STRICT] = "strict", +}; + +DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy); diff --git a/src/core/cgroup.h b/src/core/cgroup.h new file mode 100644 index 0000000..266daa2 --- /dev/null +++ b/src/core/cgroup.h @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <stdbool.h> + +#include "cgroup-util.h" +#include "ip-address-access.h" +#include "list.h" +#include "time-util.h" + +typedef struct CGroupContext CGroupContext; +typedef struct CGroupDeviceAllow CGroupDeviceAllow; +typedef struct CGroupIODeviceWeight CGroupIODeviceWeight; +typedef struct CGroupIODeviceLimit CGroupIODeviceLimit; +typedef struct CGroupIODeviceLatency CGroupIODeviceLatency; +typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight; +typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth; + +typedef enum CGroupDevicePolicy { + + /* When devices listed, will allow those, plus built-in ones, + if none are listed will allow everything. */ + CGROUP_AUTO, + + /* Everything forbidden, except built-in ones and listed ones. */ + CGROUP_CLOSED, + + /* Everythings forbidden, except for the listed devices */ + CGROUP_STRICT, + + _CGROUP_DEVICE_POLICY_MAX, + _CGROUP_DEVICE_POLICY_INVALID = -1 +} CGroupDevicePolicy; + +struct CGroupDeviceAllow { + LIST_FIELDS(CGroupDeviceAllow, device_allow); + char *path; + bool r:1; + bool w:1; + bool m:1; +}; + +struct CGroupIODeviceWeight { + LIST_FIELDS(CGroupIODeviceWeight, device_weights); + char *path; + uint64_t weight; +}; + +struct CGroupIODeviceLimit { + LIST_FIELDS(CGroupIODeviceLimit, device_limits); + char *path; + uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX]; +}; + +struct CGroupIODeviceLatency { + LIST_FIELDS(CGroupIODeviceLatency, device_latencies); + char *path; + usec_t target_usec; +}; + +struct CGroupBlockIODeviceWeight { + LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights); + char *path; + uint64_t weight; +}; + +struct CGroupBlockIODeviceBandwidth { + LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths); + char *path; + uint64_t rbps; + uint64_t wbps; +}; + +struct CGroupContext { + bool cpu_accounting; + bool io_accounting; + bool blockio_accounting; + bool memory_accounting; + bool tasks_accounting; + bool ip_accounting; + + /* For unified hierarchy */ + uint64_t cpu_weight; + uint64_t startup_cpu_weight; + usec_t cpu_quota_per_sec_usec; + + uint64_t io_weight; + uint64_t startup_io_weight; + LIST_HEAD(CGroupIODeviceWeight, io_device_weights); + LIST_HEAD(CGroupIODeviceLimit, io_device_limits); + LIST_HEAD(CGroupIODeviceLatency, io_device_latencies); + + uint64_t memory_min; + uint64_t memory_low; + uint64_t memory_high; + uint64_t memory_max; + uint64_t memory_swap_max; + + LIST_HEAD(IPAddressAccessItem, ip_address_allow); + LIST_HEAD(IPAddressAccessItem, ip_address_deny); + + /* For legacy hierarchies */ + uint64_t cpu_shares; + uint64_t startup_cpu_shares; + + uint64_t blockio_weight; + uint64_t startup_blockio_weight; + LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights); + LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths); + + uint64_t memory_limit; + + CGroupDevicePolicy device_policy; + LIST_HEAD(CGroupDeviceAllow, device_allow); + + /* Common */ + uint64_t tasks_max; + + bool delegate; + CGroupMask delegate_controllers; + + CGroupMask disable_controllers; +}; + +/* Used when querying IP accounting data */ +typedef enum CGroupIPAccountingMetric { + CGROUP_IP_INGRESS_BYTES, + CGROUP_IP_INGRESS_PACKETS, + CGROUP_IP_EGRESS_BYTES, + CGROUP_IP_EGRESS_PACKETS, + _CGROUP_IP_ACCOUNTING_METRIC_MAX, + _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -1, +} CGroupIPAccountingMetric; + +typedef struct Unit Unit; +typedef struct Manager Manager; + +void cgroup_context_init(CGroupContext *c); +void cgroup_context_done(CGroupContext *c); +void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix); + +void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a); +void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w); +void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l); +void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l); +void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w); +void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b); + +int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode); + +CGroupMask unit_get_own_mask(Unit *u); +CGroupMask unit_get_delegate_mask(Unit *u); +CGroupMask unit_get_members_mask(Unit *u); +CGroupMask unit_get_siblings_mask(Unit *u); +CGroupMask unit_get_subtree_mask(Unit *u); +CGroupMask unit_get_disable_mask(Unit *u); +CGroupMask unit_get_ancestor_disable_mask(Unit *u); + +CGroupMask unit_get_target_mask(Unit *u); +CGroupMask unit_get_enable_mask(Unit *u); + +void unit_invalidate_cgroup_members_masks(Unit *u); + +void unit_add_to_cgroup_realize_queue(Unit *u); + +const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask); +char *unit_default_cgroup_path(const Unit *u); +int unit_set_cgroup_path(Unit *u, const char *path); +int unit_pick_cgroup_path(Unit *u); + +int unit_realize_cgroup(Unit *u); +void unit_release_cgroup(Unit *u); +void unit_prune_cgroup(Unit *u); +int unit_watch_cgroup(Unit *u); + +void unit_add_to_cgroup_empty_queue(Unit *u); + +int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path); + +int manager_setup_cgroup(Manager *m); +void manager_shutdown_cgroup(Manager *m, bool delete); + +unsigned manager_dispatch_cgroup_realize_queue(Manager *m); + +Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup); +Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid); +Unit* manager_get_unit_by_pid(Manager *m, pid_t pid); + +int unit_search_main_pid(Unit *u, pid_t *ret); +int unit_watch_all_pids(Unit *u); + +int unit_synthesize_cgroup_empty_event(Unit *u); + +int unit_get_memory_current(Unit *u, uint64_t *ret); +int unit_get_tasks_current(Unit *u, uint64_t *ret); +int unit_get_cpu_usage(Unit *u, nsec_t *ret); +int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret); + +int unit_reset_cpu_accounting(Unit *u); +int unit_reset_ip_accounting(Unit *u); + +#define UNIT_CGROUP_BOOL(u, name) \ + ({ \ + CGroupContext *cc = unit_get_cgroup_context(u); \ + cc ? cc->name : false; \ + }) + +bool manager_owns_host_root_cgroup(Manager *m); +bool unit_has_host_root_cgroup(Unit *u); + +int manager_notify_cgroup_empty(Manager *m, const char *group); + +void unit_invalidate_cgroup(Unit *u, CGroupMask m); +void unit_invalidate_cgroup_bpf(Unit *u); + +void manager_invalidate_startup_units(Manager *m); + +const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_; +CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_; + +bool unit_cgroup_delegate(Unit *u); diff --git a/src/core/chown-recursive.c b/src/core/chown-recursive.c new file mode 100644 index 0000000..7767301 --- /dev/null +++ b/src/core/chown-recursive.c @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/xattr.h> + +#include "chown-recursive.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "macro.h" +#include "stdio-util.h" +#include "strv.h" +#include "user-util.h" + +static int chown_one(int fd, const struct stat *st, uid_t uid, gid_t gid) { + char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1]; + const char *n; + + assert(fd >= 0); + assert(st); + + if ((!uid_is_valid(uid) || st->st_uid == uid) && + (!gid_is_valid(gid) || st->st_gid == gid)) + return 0; + + /* We change ownership through the /proc/self/fd/%i path, so that we have a stable reference that works with + * O_PATH. (Note: fchown() and fchmod() do not work with O_PATH, the kernel refuses that. */ + xsprintf(procfs_path, "/proc/self/fd/%i", fd); + + /* Drop any ACL if there is one */ + FOREACH_STRING(n, "system.posix_acl_access", "system.posix_acl_default") + if (removexattr(procfs_path, n) < 0) + if (!IN_SET(errno, ENODATA, EOPNOTSUPP, ENOSYS, ENOTTY)) + return -errno; + + if (chown(procfs_path, uid, gid) < 0) + return -errno; + + /* The linux kernel alters the mode in some cases of chown(), as well when we change ACLs. Let's undo this. We + * do this only for non-symlinks however. That's because for symlinks the access mode is ignored anyway and + * because on some kernels/file systems trying to change the access mode will succeed but has no effect while + * on others it actively fails. */ + if (!S_ISLNK(st->st_mode)) + if (chmod(procfs_path, st->st_mode & 07777) < 0) + return -errno; + + return 1; +} + +static int chown_recursive_internal(int fd, const struct stat *st, uid_t uid, gid_t gid) { + _cleanup_closedir_ DIR *d = NULL; + bool changed = false; + struct dirent *de; + int r; + + assert(fd >= 0); + assert(st); + + d = fdopendir(fd); + if (!d) { + safe_close(fd); + return -errno; + } + + FOREACH_DIRENT_ALL(de, d, return -errno) { + _cleanup_close_ int path_fd = -1; + struct stat fst; + + if (dot_or_dot_dot(de->d_name)) + continue; + + /* Let's pin the child inode we want to fix now with an O_PATH fd, so that it cannot be swapped out + * while we manipulate it. */ + path_fd = openat(dirfd(d), de->d_name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (path_fd < 0) + return -errno; + + if (fstat(path_fd, &fst) < 0) + return -errno; + + if (S_ISDIR(fst.st_mode)) { + int subdir_fd; + + /* Convert it to a "real" (i.e. non-O_PATH) fd now */ + subdir_fd = fd_reopen(path_fd, O_RDONLY|O_CLOEXEC|O_NOATIME); + if (subdir_fd < 0) + return subdir_fd; + + r = chown_recursive_internal(subdir_fd, &fst, uid, gid); /* takes possession of subdir_fd even on failure */ + if (r < 0) + return r; + if (r > 0) + changed = true; + } else { + r = chown_one(path_fd, &fst, uid, gid); + if (r < 0) + return r; + if (r > 0) + changed = true; + } + } + + r = chown_one(dirfd(d), st, uid, gid); + if (r < 0) + return r; + + return r > 0 || changed; +} + +int path_chown_recursive(const char *path, uid_t uid, gid_t gid) { + _cleanup_close_ int fd = -1; + struct stat st; + + fd = open(path, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME); + if (fd < 0) + return -errno; + + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return 0; /* nothing to do */ + + if (fstat(fd, &st) < 0) + return -errno; + + /* Let's take a shortcut: if the top-level directory is properly owned, we don't descend into the whole tree, + * under the assumption that all is OK anyway. */ + + if ((!uid_is_valid(uid) || st.st_uid == uid) && + (!gid_is_valid(gid) || st.st_gid == gid)) + return 0; + + return chown_recursive_internal(TAKE_FD(fd), &st, uid, gid); /* we donate the fd to the call, regardless if it succeeded or failed */ +} diff --git a/src/core/chown-recursive.h b/src/core/chown-recursive.h new file mode 100644 index 0000000..f3fa40a --- /dev/null +++ b/src/core/chown-recursive.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <sys/types.h> + +int path_chown_recursive(const char *path, uid_t uid, gid_t gid); diff --git a/src/core/dbus-automount.c b/src/core/dbus-automount.c new file mode 100644 index 0000000..bd6e6a9 --- /dev/null +++ b/src/core/dbus-automount.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "automount.h" +#include "bus-util.h" +#include "dbus-automount.h" +#include "dbus-util.h" +#include "string-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, automount_result, AutomountResult); + +const sd_bus_vtable bus_automount_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Automount, where), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Automount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Automount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutIdleUSec", "t", bus_property_get_usec, offsetof(Automount, timeout_idle_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static int bus_automount_set_transient_property( + Automount *a, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(a); + + assert(a); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Where")) + return bus_set_transient_path(u, name, &a->where, message, flags, error); + + if (streq(name, "TimeoutIdleUSec")) + return bus_set_transient_usec_fix_0(u, name, &a->timeout_idle_usec, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &a->directory_mode, message, flags, error); + + return 0; +} + +int bus_automount_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Automount *a = AUTOMOUNT(u); + + assert(a); + assert(name); + assert(message); + + if (u->transient && u->load_state == UNIT_STUB) /* This is a transient unit? let's load a little more */ + return bus_automount_set_transient_property(a, name, message, flags, error); + + return 0; +} diff --git a/src/core/dbus-automount.h b/src/core/dbus-automount.h new file mode 100644 index 0000000..3e165b0 --- /dev/null +++ b/src/core/dbus-automount.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_automount_vtable[]; + +int bus_automount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c new file mode 100644 index 0000000..53890bc --- /dev/null +++ b/src/core/dbus-cgroup.c @@ -0,0 +1,1415 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <arpa/inet.h> +#include <stdio_ext.h> + +#include "af-list.h" +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bus-util.h" +#include "cgroup-util.h" +#include "cgroup.h" +#include "dbus-cgroup.h" +#include "dbus-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "path-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy); + +static int property_get_cgroup_mask( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupMask *mask = userdata; + CGroupController ctrl; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) { + if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0) + continue; + + r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl)); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_delegate_controllers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + if (!c->delegate) + return sd_bus_message_append(reply, "as", 0); + + return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error); +} + +static int property_get_io_device_weight( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupIODeviceWeight *w; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_weights, w, c->io_device_weights) { + r = sd_bus_message_append(reply, "(st)", w->path, w->weight); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_io_device_limits( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupIODeviceLimit *l; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_limits, l, c->io_device_limits) { + CGroupIOLimitType type; + + type = cgroup_io_limit_type_from_string(property); + if (type < 0 || l->limits[type] == cgroup_io_limit_defaults[type]) + continue; + + r = sd_bus_message_append(reply, "(st)", l->path, l->limits[type]); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_io_device_latency( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupIODeviceLatency *l; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_latencies, l, c->io_device_latencies) { + r = sd_bus_message_append(reply, "(st)", l->path, l->target_usec); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_blockio_device_weight( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupBlockIODeviceWeight *w; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) { + r = sd_bus_message_append(reply, "(st)", w->path, w->weight); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_blockio_device_bandwidths( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupBlockIODeviceBandwidth *b; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + uint64_t v; + + if (streq(property, "BlockIOReadBandwidth")) + v = b->rbps; + else + v = b->wbps; + + if (v == CGROUP_LIMIT_MAX) + continue; + + r = sd_bus_message_append(reply, "(st)", b->path, v); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_device_allow( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupDeviceAllow *a; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(device_allow, a, c->device_allow) { + unsigned k = 0; + char rwm[4]; + + if (a->r) + rwm[k++] = 'r'; + if (a->w) + rwm[k++] = 'w'; + if (a->m) + rwm[k++] = 'm'; + + rwm[k] = 0; + + r = sd_bus_message_append(reply, "(ss)", a->path, rwm); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_ip_address_access( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + IPAddressAccessItem** items = userdata, *i; + int r; + + r = sd_bus_message_open_container(reply, 'a', "(iayu)"); + if (r < 0) + return r; + + LIST_FOREACH(items, i, *items) { + + r = sd_bus_message_open_container(reply, 'r', "iayu"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "i", i->family); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &i->address, FAMILY_ADDRESS_SIZE(i->family)); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "u", (uint32_t) i->prefixlen); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_cgroup_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0), + SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0), + SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0), + SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0), + SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0), + SD_BUS_PROPERTY("CPUShares", "t", NULL, offsetof(CGroupContext, cpu_shares), 0), + SD_BUS_PROPERTY("StartupCPUShares", "t", NULL, offsetof(CGroupContext, startup_cpu_shares), 0), + SD_BUS_PROPERTY("CPUQuotaPerSecUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_per_sec_usec), 0), + SD_BUS_PROPERTY("IOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, io_accounting), 0), + SD_BUS_PROPERTY("IOWeight", "t", NULL, offsetof(CGroupContext, io_weight), 0), + SD_BUS_PROPERTY("StartupIOWeight", "t", NULL, offsetof(CGroupContext, startup_io_weight), 0), + SD_BUS_PROPERTY("IODeviceWeight", "a(st)", property_get_io_device_weight, 0, 0), + SD_BUS_PROPERTY("IOReadBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IOWriteBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IOReadIOPSMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IOWriteIOPSMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IODeviceLatencyTargetUSec", "a(st)", property_get_io_device_latency, 0, 0), + SD_BUS_PROPERTY("BlockIOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, blockio_accounting), 0), + SD_BUS_PROPERTY("BlockIOWeight", "t", NULL, offsetof(CGroupContext, blockio_weight), 0), + SD_BUS_PROPERTY("StartupBlockIOWeight", "t", NULL, offsetof(CGroupContext, startup_blockio_weight), 0), + SD_BUS_PROPERTY("BlockIODeviceWeight", "a(st)", property_get_blockio_device_weight, 0, 0), + SD_BUS_PROPERTY("BlockIOReadBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0), + SD_BUS_PROPERTY("BlockIOWriteBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0), + SD_BUS_PROPERTY("MemoryAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, memory_accounting), 0), + SD_BUS_PROPERTY("MemoryMin", "t", NULL, offsetof(CGroupContext, memory_min), 0), + SD_BUS_PROPERTY("MemoryLow", "t", NULL, offsetof(CGroupContext, memory_low), 0), + SD_BUS_PROPERTY("MemoryHigh", "t", NULL, offsetof(CGroupContext, memory_high), 0), + SD_BUS_PROPERTY("MemoryMax", "t", NULL, offsetof(CGroupContext, memory_max), 0), + SD_BUS_PROPERTY("MemorySwapMax", "t", NULL, offsetof(CGroupContext, memory_swap_max), 0), + SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0), + SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0), + SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0), + SD_BUS_PROPERTY("TasksAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, tasks_accounting), 0), + SD_BUS_PROPERTY("TasksMax", "t", NULL, offsetof(CGroupContext, tasks_max), 0), + SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0), + SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0), + SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0), + SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0), + SD_BUS_VTABLE_END +}; + +static int bus_cgroup_set_transient_property( + Unit *u, + CGroupContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int r; + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Delegate")) { + int b; + + if (!UNIT_VTABLE(u)->can_delegate) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type"); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->delegate = b; + c->delegate_controllers = b ? _CGROUP_MASK_ALL : 0; + + unit_write_settingf(u, flags, name, "Delegate=%s", yes_no(b)); + } + + return 1; + + } else if (streq(name, "DelegateControllers")) { + CGroupMask mask = 0; + + if (!UNIT_VTABLE(u)->can_delegate) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type"); + + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + CGroupController cc; + const char *t; + + r = sd_bus_message_read(message, "s", &t); + if (r < 0) + return r; + if (r == 0) + break; + + cc = cgroup_controller_from_string(t); + if (cc < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown cgroup controller '%s'", t); + + mask |= CGROUP_CONTROLLER_TO_MASK(cc); + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *t = NULL; + + r = cg_mask_to_string(mask, &t); + if (r < 0) + return r; + + c->delegate = true; + if (mask == 0) + c->delegate_controllers = 0; + else + c->delegate_controllers |= mask; + + unit_write_settingf(u, flags, name, "Delegate=%s", strempty(t)); + } + + return 1; + } + + return 0; +} + +static int bus_cgroup_set_boolean( + Unit *u, + const char *name, + bool *p, + CGroupMask mask, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int b, r; + + assert(p); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = b; + unit_invalidate_cgroup(u, mask); + unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(b)); + } + + return 1; +} + +#define BUS_DEFINE_SET_CGROUP_WEIGHT(function, mask, check, val) \ + static int bus_cgroup_set_##function( \ + Unit *u, \ + const char *name, \ + uint64_t *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + uint64_t v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "t", &v); \ + if (r < 0) \ + return r; \ + \ + if (!check(v)) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Value specified in %s is out of range", name); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_invalidate_cgroup(u, (mask)); \ + \ + if (v == (val)) \ + unit_write_settingf(u, flags, name, \ + "%s=", name); \ + else \ + unit_write_settingf(u, flags, name, \ + "%s=%" PRIu64, name, v); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_CGROUP_LIMIT(function, mask, scale, minimum) \ + static int bus_cgroup_set_##function( \ + Unit *u, \ + const char *name, \ + uint64_t *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + uint64_t v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "t", &v); \ + if (r < 0) \ + return r; \ + \ + if (v < minimum) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Value specified in %s is out of range", name); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_invalidate_cgroup(u, (mask)); \ + \ + if (v == CGROUP_LIMIT_MAX) \ + unit_write_settingf(u, flags, name, \ + "%s=infinity", name); \ + else \ + unit_write_settingf(u, flags, name, \ + "%s=%" PRIu64, name, v); \ + } \ + \ + return 1; \ + } \ + static int bus_cgroup_set_##function##_scale( \ + Unit *u, \ + const char *name, \ + uint64_t *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + uint64_t v; \ + uint32_t raw; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "u", &raw); \ + if (r < 0) \ + return r; \ + \ + v = scale(raw, UINT32_MAX); \ + if (v < minimum || v >= UINT64_MAX) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Value specified in %s is out of range", name); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + const char *e; \ + \ + *p = v; \ + unit_invalidate_cgroup(u, (mask)); \ + \ + /* Chop off suffix */ \ + assert_se(e = endswith(name, "Scale")); \ + name = strndupa(name, e - name); \ + \ + unit_write_settingf(u, flags, name, "%s=%" PRIu32 "%%", name, \ + (uint32_t) (DIV_ROUND_UP((uint64_t) raw * 100U, (uint64_t) UINT32_MAX))); \ + } \ + \ + return 1; \ + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtype-limits" +BUS_DEFINE_SET_CGROUP_WEIGHT(cpu_weight, CGROUP_MASK_CPU, CGROUP_WEIGHT_IS_OK, CGROUP_WEIGHT_INVALID); +BUS_DEFINE_SET_CGROUP_WEIGHT(cpu_shares, CGROUP_MASK_CPU, CGROUP_CPU_SHARES_IS_OK, CGROUP_CPU_SHARES_INVALID); +BUS_DEFINE_SET_CGROUP_WEIGHT(io_weight, CGROUP_MASK_IO, CGROUP_WEIGHT_IS_OK, CGROUP_WEIGHT_INVALID); +BUS_DEFINE_SET_CGROUP_WEIGHT(blockio_weight, CGROUP_MASK_BLKIO, CGROUP_BLKIO_WEIGHT_IS_OK, CGROUP_BLKIO_WEIGHT_INVALID); +BUS_DEFINE_SET_CGROUP_LIMIT(memory, CGROUP_MASK_MEMORY, physical_memory_scale, 1); +BUS_DEFINE_SET_CGROUP_LIMIT(swap, CGROUP_MASK_MEMORY, physical_memory_scale, 0); +BUS_DEFINE_SET_CGROUP_LIMIT(tasks_max, CGROUP_MASK_PIDS, system_tasks_max_scale, 1); +#pragma GCC diagnostic pop + +int bus_cgroup_set_property( + Unit *u, + CGroupContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + CGroupIOLimitType iol_type; + int r; + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "CPUAccounting")) + return bus_cgroup_set_boolean(u, name, &c->cpu_accounting, get_cpu_accounting_mask(), message, flags, error); + + if (streq(name, "CPUWeight")) + return bus_cgroup_set_cpu_weight(u, name, &c->cpu_weight, message, flags, error); + + if (streq(name, "StartupCPUWeight")) + return bus_cgroup_set_cpu_weight(u, name, &c->startup_cpu_weight, message, flags, error); + + if (streq(name, "CPUShares")) + return bus_cgroup_set_cpu_shares(u, name, &c->cpu_shares, message, flags, error); + + if (streq(name, "StartupCPUShares")) + return bus_cgroup_set_cpu_shares(u, name, &c->startup_cpu_shares, message, flags, error); + + if (streq(name, "IOAccounting")) + return bus_cgroup_set_boolean(u, name, &c->io_accounting, CGROUP_MASK_IO, message, flags, error); + + if (streq(name, "IOWeight")) + return bus_cgroup_set_io_weight(u, name, &c->io_weight, message, flags, error); + + if (streq(name, "StartupIOWeight")) + return bus_cgroup_set_io_weight(u, name, &c->startup_io_weight, message, flags, error); + + if (streq(name, "BlockIOAccounting")) + return bus_cgroup_set_boolean(u, name, &c->blockio_accounting, CGROUP_MASK_BLKIO, message, flags, error); + + if (streq(name, "BlockIOWeight")) + return bus_cgroup_set_blockio_weight(u, name, &c->blockio_weight, message, flags, error); + + if (streq(name, "StartupBlockIOWeight")) + return bus_cgroup_set_blockio_weight(u, name, &c->startup_blockio_weight, message, flags, error); + + if (streq(name, "MemoryAccounting")) + return bus_cgroup_set_boolean(u, name, &c->memory_accounting, CGROUP_MASK_MEMORY, message, flags, error); + + if (streq(name, "MemoryMin")) + return bus_cgroup_set_memory(u, name, &c->memory_min, message, flags, error); + + if (streq(name, "MemoryLow")) + return bus_cgroup_set_memory(u, name, &c->memory_low, message, flags, error); + + if (streq(name, "MemoryHigh")) + return bus_cgroup_set_memory(u, name, &c->memory_high, message, flags, error); + + if (streq(name, "MemorySwapMax")) + return bus_cgroup_set_swap(u, name, &c->memory_swap_max, message, flags, error); + + if (streq(name, "MemoryMax")) + return bus_cgroup_set_memory(u, name, &c->memory_max, message, flags, error); + + if (streq(name, "MemoryLimit")) + return bus_cgroup_set_memory(u, name, &c->memory_limit, message, flags, error); + + if (streq(name, "MemoryMinScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_min, message, flags, error); + + if (streq(name, "MemoryLowScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_low, message, flags, error); + + if (streq(name, "MemoryHighScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_high, message, flags, error); + + if (streq(name, "MemorySwapMaxScale")) + return bus_cgroup_set_swap_scale(u, name, &c->memory_swap_max, message, flags, error); + + if (streq(name, "MemoryMaxScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_max, message, flags, error); + + if (streq(name, "MemoryLimitScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_limit, message, flags, error); + + if (streq(name, "TasksAccounting")) + return bus_cgroup_set_boolean(u, name, &c->tasks_accounting, CGROUP_MASK_PIDS, message, flags, error); + + if (streq(name, "TasksMax")) + return bus_cgroup_set_tasks_max(u, name, &c->tasks_max, message, flags, error); + + if (streq(name, "TasksMaxScale")) + return bus_cgroup_set_tasks_max_scale(u, name, &c->tasks_max, message, flags, error); + + if (streq(name, "CPUQuotaPerSecUSec")) { + uint64_t u64; + + r = sd_bus_message_read(message, "t", &u64); + if (r < 0) + return r; + + if (u64 <= 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "CPUQuotaPerSecUSec= value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_quota_per_sec_usec = u64; + unit_invalidate_cgroup(u, CGROUP_MASK_CPU); + + if (c->cpu_quota_per_sec_usec == USEC_INFINITY) + unit_write_setting(u, flags, "CPUQuota", "CPUQuota="); + else + /* config_parse_cpu_quota() requires an integer, so truncating division is used on + * purpose here. */ + unit_write_settingf(u, flags, "CPUQuota", + "CPUQuota=%0.f%%", + (double) (c->cpu_quota_per_sec_usec / 10000)); + } + + return 1; + + } else if ((iol_type = cgroup_io_limit_type_from_string(name)) >= 0) { + const char *path; + unsigned n = 0; + uint64_t u64; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceLimit *a = NULL, *b; + + LIST_FOREACH(device_limits, b, c->io_device_limits) { + if (path_equal(path, b->path)) { + a = b; + break; + } + } + + if (!a) { + CGroupIOLimitType type; + + a = new0(CGroupIODeviceLimit, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + + for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + a->limits[type] = cgroup_io_limit_defaults[type]; + + LIST_PREPEND(device_limits, c->io_device_limits, a); + } + + a->limits[iol_type] = u64; + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceLimit *a; + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t size = 0; + + if (n == 0) { + LIST_FOREACH(device_limits, a, c->io_device_limits) + a->limits[iol_type] = cgroup_io_limit_defaults[iol_type]; + } + + unit_invalidate_cgroup(u, CGROUP_MASK_IO); + + f = open_memstream(&buf, &size); + if (!f) + return -ENOMEM; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + fprintf(f, "%s=\n", name); + LIST_FOREACH(device_limits, a, c->io_device_limits) + if (a->limits[iol_type] != cgroup_io_limit_defaults[iol_type]) + fprintf(f, "%s=%s %" PRIu64 "\n", name, a->path, a->limits[iol_type]); + + r = fflush_and_check(f); + if (r < 0) + return r; + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "IODeviceWeight")) { + const char *path; + uint64_t weight; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!CGROUP_WEIGHT_IS_OK(weight) || weight == CGROUP_WEIGHT_INVALID) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "IODeviceWeight= value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceWeight *a = NULL, *b; + + LIST_FOREACH(device_weights, b, c->io_device_weights) { + if (path_equal(b->path, path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupIODeviceWeight, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + LIST_PREPEND(device_weights, c->io_device_weights, a); + } + + a->weight = weight; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + CGroupIODeviceWeight *a; + size_t size = 0; + + if (n == 0) { + while (c->io_device_weights) + cgroup_context_free_io_device_weight(c, c->io_device_weights); + } + + unit_invalidate_cgroup(u, CGROUP_MASK_IO); + + f = open_memstream(&buf, &size); + if (!f) + return -ENOMEM; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + fputs("IODeviceWeight=\n", f); + LIST_FOREACH(device_weights, a, c->io_device_weights) + fprintf(f, "IODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight); + + r = fflush_and_check(f); + if (r < 0) + return r; + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "IODeviceLatencyTargetUSec")) { + const char *path; + uint64_t target; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &target)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceLatency *a = NULL, *b; + + LIST_FOREACH(device_latencies, b, c->io_device_latencies) { + if (path_equal(b->path, path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupIODeviceLatency, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + LIST_PREPEND(device_latencies, c->io_device_latencies, a); + } + + a->target_usec = target; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + char ts[FORMAT_TIMESPAN_MAX]; + CGroupIODeviceLatency *a; + size_t size = 0; + + if (n == 0) { + while (c->io_device_latencies) + cgroup_context_free_io_device_latency(c, c->io_device_latencies); + } + + unit_invalidate_cgroup(u, CGROUP_MASK_IO); + + f = open_memstream(&buf, &size); + if (!f) + return -ENOMEM; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + fputs("IODeviceLatencyTargetSec=\n", f); + LIST_FOREACH(device_latencies, a, c->io_device_latencies) + fprintf(f, "IODeviceLatencyTargetSec=%s %s\n", + a->path, format_timespan(ts, sizeof(ts), a->target_usec, 1)); + + r = fflush_and_check(f); + if (r < 0) + return r; + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (STR_IN_SET(name, "BlockIOReadBandwidth", "BlockIOWriteBandwidth")) { + const char *path; + bool read = true; + unsigned n = 0; + uint64_t u64; + + if (streq(name, "BlockIOWriteBandwidth")) + read = false; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupBlockIODeviceBandwidth *a = NULL, *b; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + if (path_equal(path, b->path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupBlockIODeviceBandwidth, 1); + if (!a) + return -ENOMEM; + + a->rbps = CGROUP_LIMIT_MAX; + a->wbps = CGROUP_LIMIT_MAX; + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + + LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a); + } + + if (read) + a->rbps = u64; + else + a->wbps = u64; + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupBlockIODeviceBandwidth *a; + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t size = 0; + + if (n == 0) { + LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) { + if (read) + a->rbps = CGROUP_LIMIT_MAX; + else + a->wbps = CGROUP_LIMIT_MAX; + } + } + + unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO); + + f = open_memstream(&buf, &size); + if (!f) + return -ENOMEM; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + if (read) { + fputs("BlockIOReadBandwidth=\n", f); + LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) + if (a->rbps != CGROUP_LIMIT_MAX) + fprintf(f, "BlockIOReadBandwidth=%s %" PRIu64 "\n", a->path, a->rbps); + } else { + fputs("BlockIOWriteBandwidth=\n", f); + LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) + if (a->wbps != CGROUP_LIMIT_MAX) + fprintf(f, "BlockIOWriteBandwidth=%s %" PRIu64 "\n", a->path, a->wbps); + } + + r = fflush_and_check(f); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "BlockIODeviceWeight")) { + const char *path; + uint64_t weight; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!CGROUP_BLKIO_WEIGHT_IS_OK(weight) || weight == CGROUP_BLKIO_WEIGHT_INVALID) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "BlockIODeviceWeight= out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupBlockIODeviceWeight *a = NULL, *b; + + LIST_FOREACH(device_weights, b, c->blockio_device_weights) { + if (path_equal(b->path, path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupBlockIODeviceWeight, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + LIST_PREPEND(device_weights, c->blockio_device_weights, a); + } + + a->weight = weight; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + CGroupBlockIODeviceWeight *a; + size_t size = 0; + + if (n == 0) { + while (c->blockio_device_weights) + cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); + } + + unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO); + + f = open_memstream(&buf, &size); + if (!f) + return -ENOMEM; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + fputs("BlockIODeviceWeight=\n", f); + LIST_FOREACH(device_weights, a, c->blockio_device_weights) + fprintf(f, "BlockIODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight); + + r = fflush_and_check(f); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "DevicePolicy")) { + const char *policy; + CGroupDevicePolicy p; + + r = sd_bus_message_read(message, "s", &policy); + if (r < 0) + return r; + + p = cgroup_device_policy_from_string(policy); + if (p < 0) + return -EINVAL; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->device_policy = p; + unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES); + unit_write_settingf(u, flags, name, "DevicePolicy=%s", policy); + } + + return 1; + + } else if (streq(name, "DeviceAllow")) { + const char *path, *rwm; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &path, &rwm)) > 0) { + + if (!valid_device_allow_pattern(path) || strpbrk(path, WHITESPACE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires device node or pattern"); + + if (isempty(rwm)) + rwm = "rwm"; + else if (!in_charset(rwm, "rwm")) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires combination of rwm flags"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupDeviceAllow *a = NULL, *b; + + LIST_FOREACH(device_allow, b, c->device_allow) { + if (path_equal(b->path, path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupDeviceAllow, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + + LIST_PREPEND(device_allow, c->device_allow, a); + } + + a->r = !!strchr(rwm, 'r'); + a->w = !!strchr(rwm, 'w'); + a->m = !!strchr(rwm, 'm'); + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + CGroupDeviceAllow *a; + size_t size = 0; + + if (n == 0) { + while (c->device_allow) + cgroup_context_free_device_allow(c, c->device_allow); + } + + unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES); + + f = open_memstream(&buf, &size); + if (!f) + return -ENOMEM; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + fputs("DeviceAllow=\n", f); + LIST_FOREACH(device_allow, a, c->device_allow) + fprintf(f, "DeviceAllow=%s %s%s%s\n", a->path, a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : ""); + + r = fflush_and_check(f); + if (r < 0) + return r; + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "IPAccounting")) { + int b; + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->ip_accounting = b; + + unit_invalidate_cgroup_bpf(u); + unit_write_settingf(u, flags, name, "IPAccounting=%s", yes_no(b)); + } + + return 1; + + } else if (STR_IN_SET(name, "IPAddressAllow", "IPAddressDeny")) { + IPAddressAccessItem **list; + size_t n = 0; + + list = streq(name, "IPAddressAllow") ? &c->ip_address_allow : &c->ip_address_deny; + + r = sd_bus_message_enter_container(message, 'a', "(iayu)"); + if (r < 0) + return r; + + for (;;) { + const void *ap; + int32_t family; + uint32_t prefixlen; + size_t an; + + r = sd_bus_message_enter_container(message, 'r', "iayu"); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_read(message, "i", &family); + if (r < 0) + return r; + + if (!IN_SET(family, AF_INET, AF_INET6)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects IPv4 or IPv6 addresses only.", name); + + r = sd_bus_message_read_array(message, 'y', &ap, &an); + if (r < 0) + return r; + + if (an != FAMILY_ADDRESS_SIZE(family)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "IP address has wrong size for family (%s, expected %zu, got %zu)", + af_to_name(family), FAMILY_ADDRESS_SIZE(family), an); + + r = sd_bus_message_read(message, "u", &prefixlen); + if (r < 0) + return r; + + if (prefixlen > FAMILY_ADDRESS_SIZE(family)*8) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Prefix length %" PRIu32 " too large for address family %s.", prefixlen, af_to_name(family)); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + IPAddressAccessItem *item; + + item = new0(IPAddressAccessItem, 1); + if (!item) + return -ENOMEM; + + item->family = family; + item->prefixlen = prefixlen; + memcpy(&item->address, ap, an); + + LIST_PREPEND(items, *list, item); + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + *list = ip_address_access_reduce(*list); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + IPAddressAccessItem *item; + size_t size = 0; + + if (n == 0) + *list = ip_address_access_free_all(*list); + + unit_invalidate_cgroup_bpf(u); + f = open_memstream(&buf, &size); + if (!f) + return -ENOMEM; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + fputs(name, f); + fputs("=\n", f); + + LIST_FOREACH(items, item, *list) { + char buffer[CONST_MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)]; + + errno = 0; + if (!inet_ntop(item->family, &item->address, buffer, sizeof(buffer))) + return errno > 0 ? -errno : -EINVAL; + + fprintf(f, "%s=%s/%u\n", name, buffer, item->prefixlen); + } + + r = fflush_and_check(f); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + + if (*list) { + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == BPF_FIREWALL_UNSUPPORTED) { + static bool warned = false; + + log_full(warned ? LOG_DEBUG : LOG_WARNING, + "Transient unit %s configures an IP firewall, but the local system does not support BPF/cgroup firewalling.\n" + "Proceeding WITHOUT firewalling in effect! (This warning is only shown for the first started transient unit using IP firewalling.)", u->id); + + warned = true; + } + } + } + + return 1; + } + + if (u->transient && u->load_state == UNIT_STUB) + return bus_cgroup_set_transient_property(u, c, name, message, flags, error); + + return 0; +} diff --git a/src/core/dbus-cgroup.h b/src/core/dbus-cgroup.h new file mode 100644 index 0000000..188baa9 --- /dev/null +++ b/src/core/dbus-cgroup.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" +#include "cgroup.h" + +extern const sd_bus_vtable bus_cgroup_vtable[]; + +int bus_cgroup_set_property(Unit *u, CGroupContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-device.c b/src/core/dbus-device.c new file mode 100644 index 0000000..6cf7f58 --- /dev/null +++ b/src/core/dbus-device.c @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "dbus-device.h" +#include "device.h" +#include "unit.h" + +const sd_bus_vtable bus_device_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("SysFSPath", "s", NULL, offsetof(Device, sysfs), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_VTABLE_END +}; diff --git a/src/core/dbus-device.h b/src/core/dbus-device.h new file mode 100644 index 0000000..077a2bf --- /dev/null +++ b/src/core/dbus-device.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus-vtable.h" + +extern const sd_bus_vtable bus_device_vtable[]; diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c new file mode 100644 index 0000000..11301e4 --- /dev/null +++ b/src/core/dbus-execute.c @@ -0,0 +1,2423 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <sys/mount.h> +#include <sys/prctl.h> +#include <stdio_ext.h> + +#if HAVE_SECCOMP +#include <seccomp.h> +#endif + +#include "af-list.h" +#include "alloc-util.h" +#include "bus-util.h" +#include "cap-list.h" +#include "capability-util.h" +#include "cpu-set-util.h" +#include "dbus-execute.h" +#include "dbus-util.h" +#include "env-util.h" +#include "errno-list.h" +#include "escape.h" +#include "execute.h" +#include "fd-util.h" +#include "fileio.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "ioprio.h" +#include "journal-util.h" +#include "missing.h" +#include "mountpoint-util.h" +#include "namespace.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif +#include "securebits-util.h" +#include "specifier.h" +#include "strv.h" +#include "syslog-util.h" +#include "unit-printf.h" +#include "user-util.h" +#include "utf8.h" + +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_output, exec_output, ExecOutput); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInput); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long); +static BUS_DEFINE_PROPERTY_GET(property_get_ioprio, "i", ExecContext, exec_context_get_effective_ioprio); +static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_class, "i", ExecContext, exec_context_get_effective_ioprio, IOPRIO_PRIO_CLASS); +static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext, exec_context_get_effective_ioprio, IOPRIO_PRIO_DATA); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC); + +static int property_get_environment_files( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + char **j; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(sb)"); + if (r < 0) + return r; + + STRV_FOREACH(j, c->environment_files) { + const char *fn = *j; + + r = sd_bus_message_append(reply, "(sb)", fn[0] == '-' ? fn + 1 : fn, fn[0] == '-'); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_oom_score_adjust( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int32_t n; + + assert(bus); + assert(reply); + assert(c); + + if (c->oom_score_adjust_set) + n = c->oom_score_adjust; + else { + _cleanup_free_ char *t = NULL; + + n = 0; + if (read_one_line_file("/proc/self/oom_score_adj", &t) >= 0) + safe_atoi32(t, &n); + } + + return sd_bus_message_append(reply, "i", n); +} + +static int property_get_nice( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int32_t n; + + assert(bus); + assert(reply); + assert(c); + + if (c->nice_set) + n = c->nice; + else { + errno = 0; + n = getpriority(PRIO_PROCESS, 0); + if (errno > 0) + n = 0; + } + + return sd_bus_message_append(reply, "i", n); +} + +static int property_get_cpu_sched_policy( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int32_t n; + + assert(bus); + assert(reply); + assert(c); + + if (c->cpu_sched_set) + n = c->cpu_sched_policy; + else { + n = sched_getscheduler(0); + if (n < 0) + n = SCHED_OTHER; + } + + return sd_bus_message_append(reply, "i", n); +} + +static int property_get_cpu_sched_priority( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int32_t n; + + assert(bus); + assert(reply); + assert(c); + + if (c->cpu_sched_set) + n = c->cpu_sched_priority; + else { + struct sched_param p = {}; + + if (sched_getparam(0, &p) >= 0) + n = p.sched_priority; + else + n = 0; + } + + return sd_bus_message_append(reply, "i", n); +} + +static int property_get_cpu_affinity( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + return sd_bus_message_append_array(reply, 'y', c->cpuset, CPU_ALLOC_SIZE(c->cpuset_ncpus)); +} + +static int property_get_timer_slack_nsec( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + uint64_t u; + + assert(bus); + assert(reply); + assert(c); + + if (c->timer_slack_nsec != NSEC_INFINITY) + u = (uint64_t) c->timer_slack_nsec; + else + u = (uint64_t) prctl(PR_GET_TIMERSLACK); + + return sd_bus_message_append(reply, "t", u); +} + +static int property_get_syscall_filter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + _cleanup_strv_free_ char **l = NULL; + int r; + +#if HAVE_SECCOMP + Iterator i; + void *id, *val; +#endif + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "b", c->syscall_whitelist); + if (r < 0) + return r; + +#if HAVE_SECCOMP + HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, i) { + _cleanup_free_ char *name = NULL; + const char *e = NULL; + char *s; + int num = PTR_TO_INT(val); + + name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); + if (!name) + continue; + + if (num >= 0) { + e = errno_to_name(num); + if (e) { + s = strjoin(name, ":", e); + if (!s) + return -ENOMEM; + } else { + r = asprintf(&s, "%s:%d", name, num); + if (r < 0) + return -ENOMEM; + } + } else + s = TAKE_PTR(name); + + r = strv_consume(&l, s); + if (r < 0) + return r; + } +#endif + + strv_sort(l); + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int property_get_syscall_archs( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + _cleanup_strv_free_ char **l = NULL; + int r; + +#if HAVE_SECCOMP + Iterator i; + void *id; +#endif + + assert(bus); + assert(reply); + assert(c); + +#if HAVE_SECCOMP + SET_FOREACH(id, c->syscall_archs, i) { + const char *name; + + name = seccomp_arch_to_string(PTR_TO_UINT32(id) - 1); + if (!name) + continue; + + r = strv_extend(&l, name); + if (r < 0) + return -ENOMEM; + } +#endif + + strv_sort(l); + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return 0; +} + +static int property_get_selinux_context( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + return sd_bus_message_append(reply, "(bs)", c->selinux_context_ignore, c->selinux_context); +} + +static int property_get_apparmor_profile( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + return sd_bus_message_append(reply, "(bs)", c->apparmor_profile_ignore, c->apparmor_profile); +} + +static int property_get_smack_process_label( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + return sd_bus_message_append(reply, "(bs)", c->smack_process_label_ignore, c->smack_process_label); +} + +static int property_get_address_families( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + _cleanup_strv_free_ char **l = NULL; + Iterator i; + void *af; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "b", c->address_families_whitelist); + if (r < 0) + return r; + + SET_FOREACH(af, c->address_families, i) { + const char *name; + + name = af_to_name(PTR_TO_INT(af)); + if (!name) + continue; + + r = strv_extend(&l, name); + if (r < 0) + return -ENOMEM; + } + + strv_sort(l); + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int property_get_working_directory( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + const char *wd; + + assert(bus); + assert(reply); + assert(c); + + if (c->working_directory_home) + wd = "~"; + else + wd = c->working_directory; + + if (c->working_directory_missing_ok) + wd = strjoina("!", wd); + + return sd_bus_message_append(reply, "s", wd); +} + +static int property_get_stdio_fdname( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int fileno; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + if (streq(property, "StandardInputFileDescriptorName")) + fileno = STDIN_FILENO; + else if (streq(property, "StandardOutputFileDescriptorName")) + fileno = STDOUT_FILENO; + else { + assert(streq(property, "StandardErrorFileDescriptorName")); + fileno = STDERR_FILENO; + } + + return sd_bus_message_append(reply, "s", exec_context_fdname(c, fileno)); +} + +static int property_get_input_data( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + return sd_bus_message_append_array(reply, 'y', c->stdin_data, c->stdin_data_size); +} + +static int property_get_bind_paths( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + unsigned i; + bool ro; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + ro = strstr(property, "ReadOnly"); + + r = sd_bus_message_open_container(reply, 'a', "(ssbt)"); + if (r < 0) + return r; + + for (i = 0; i < c->n_bind_mounts; i++) { + + if (ro != c->bind_mounts[i].read_only) + continue; + + r = sd_bus_message_append( + reply, "(ssbt)", + c->bind_mounts[i].source, + c->bind_mounts[i].destination, + c->bind_mounts[i].ignore_enoent, + c->bind_mounts[i].recursive ? (uint64_t) MS_REC : (uint64_t) 0); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_temporary_filesystems( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + unsigned i; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + for (i = 0; i < c->n_temporary_filesystems; i++) { + TemporaryFileSystem *t = c->temporary_filesystems + i; + + r = sd_bus_message_append( + reply, "(ss)", + t->path, + t->options); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_log_extra_fields( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + size_t i; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "ay"); + if (r < 0) + return r; + + for (i = 0; i < c->n_log_extra_fields; i++) { + r = sd_bus_message_append_array(reply, 'y', c->log_extra_fields[i].iov_base, c->log_extra_fields[i].iov_len); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_exec_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Environment", "as", NULL, offsetof(ExecContext, environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("EnvironmentFiles", "a(sb)", property_get_environment_files, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassEnvironment", "as", NULL, offsetof(ExecContext, pass_environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UnsetEnvironment", "as", NULL, offsetof(ExecContext, unset_environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UMask", "u", bus_property_get_mode, offsetof(ExecContext, umask), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCPU", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCPUSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitFSIZE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitDATA", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitDATASoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSTACK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCORE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCORESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRSS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRSSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNOFILE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitAS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitASSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNPROC", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitLOCKS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNICE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNICESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTPRIO", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTTIME", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WorkingDirectory", "s", property_get_working_directory, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(ExecContext, root_directory), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootImage", "s", NULL, offsetof(ExecContext, root_image), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Nice", "i", property_get_nice, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IOSchedulingClass", "i", property_get_ioprio_class, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IOSchedulingPriority", "i", property_get_ioprio_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInput", "s", property_get_exec_input, offsetof(ExecContext, std_input), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInputData", "ay", property_get_input_data, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardOutput", "s", bus_property_get_exec_output, offsetof(ExecContext, std_output), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardOutputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardError", "s", bus_property_get_exec_output, offsetof(ExecContext, std_error), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardErrorFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYPath", "s", NULL, offsetof(ExecContext, tty_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYReset", "b", bus_property_get_bool, offsetof(ExecContext, tty_reset), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYVHangup", "b", bus_property_get_bool, offsetof(ExecContext, tty_vhangup), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYVTDisallocate", "b", bus_property_get_bool, offsetof(ExecContext, tty_vt_disallocate), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogPriority", "i", bus_property_get_int, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogIdentifier", "s", NULL, offsetof(ExecContext, syslog_identifier), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogLevelPrefix", "b", bus_property_get_bool, offsetof(ExecContext, syslog_level_prefix), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogLevel", "i", property_get_syslog_level, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogFacility", "i", property_get_syslog_facility, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogLevelMax", "i", bus_property_get_int, offsetof(ExecContext, log_level_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogRateLimitIntervalUSec", "t", bus_property_get_usec, offsetof(ExecContext, log_rate_limit_interval_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogRateLimitBurst", "u", bus_property_get_unsigned, offsetof(ExecContext, log_rate_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogExtraFields", "aay", property_get_log_extra_fields, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SecureBits", "i", bus_property_get_int, offsetof(ExecContext, secure_bits), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CapabilityBoundingSet", "t", NULL, offsetof(ExecContext, capability_bounding_set), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AmbientCapabilities", "t", NULL, offsetof(ExecContext, capability_ambient_set), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("User", "s", NULL, offsetof(ExecContext, user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Group", "s", NULL, offsetof(ExecContext, group), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DynamicUser", "b", bus_property_get_bool, offsetof(ExecContext, dynamic_user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(ExecContext, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SupplementaryGroups", "as", NULL, offsetof(ExecContext, supplementary_groups), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PAMName", "s", NULL, offsetof(ExecContext, pam_name), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReadWritePaths", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReadOnlyPaths", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_bool, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UtmpIdentifier", "s", NULL, offsetof(ExecContext, utmp_id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UtmpMode", "s", property_get_exec_utmp_mode, offsetof(ExecContext, utmp_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SELinuxContext", "(bs)", property_get_selinux_context, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AppArmorProfile", "(bs)", property_get_apparmor_profile, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackProcessLabel", "(bs)", property_get_smack_process_label, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IgnoreSIGPIPE", "b", bus_property_get_bool, offsetof(ExecContext, ignore_sigpipe), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NoNewPrivileges", "b", bus_property_get_bool, offsetof(ExecContext, no_new_privileges), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallFilter", "(bas)", property_get_syscall_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallArchitectures", "as", property_get_syscall_archs, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallErrorNumber", "i", bus_property_get_int, offsetof(ExecContext, syscall_errno), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Personality", "s", property_get_personality, offsetof(ExecContext, personality), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LockPersonality", "b", bus_property_get_bool, offsetof(ExecContext, lock_personality), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictAddressFamilies", "(bas)", property_get_address_families, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectoryPreserve", "s", property_get_exec_preserve_mode, offsetof(ExecContext, runtime_directory_preserve_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StateDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StateDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CacheDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CacheDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogsDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogsDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConfigurationDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConfigurationDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST), + + /* Obsolete/redundant properties: */ + SD_BUS_PROPERTY("Capabilities", "s", property_get_empty_string, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("ReadWriteDirectories", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("ReadOnlyDirectories", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("InaccessibleDirectories", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("IOScheduling", "i", property_get_ioprio, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + + SD_BUS_VTABLE_END +}; + +static int append_exec_command(sd_bus_message *reply, ExecCommand *c) { + int r; + + assert(reply); + assert(c); + + if (!c->path) + return 0; + + r = sd_bus_message_open_container(reply, 'r', "sasbttttuii"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", c->path); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(reply, c->argv); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "bttttuii", + !!(c->flags & EXEC_COMMAND_IGNORE_FAILURE), + c->exec_status.start_timestamp.realtime, + c->exec_status.start_timestamp.monotonic, + c->exec_status.exit_timestamp.realtime, + c->exec_status.exit_timestamp.monotonic, + (uint32_t) c->exec_status.pid, + (int32_t) c->exec_status.code, + (int32_t) c->exec_status.status); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +int bus_property_get_exec_command( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *ret_error) { + + ExecCommand *c = (ExecCommand*) userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)"); + if (r < 0) + return r; + + r = append_exec_command(reply, c); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +int bus_property_get_exec_command_list( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *ret_error) { + + ExecCommand *c = *(ExecCommand**) userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)"); + if (r < 0) + return r; + + LIST_FOREACH(command, c, c) { + r = append_exec_command(reply, c); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +int bus_set_transient_exec_command( + Unit *u, + const char *name, + ExecCommand **exec_command, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + unsigned n = 0; + int r; + + r = sd_bus_message_enter_container(message, 'a', "(sasb)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_enter_container(message, 'r', "sasb")) > 0) { + _cleanup_strv_free_ char **argv = NULL; + const char *path; + int b; + + r = sd_bus_message_read(message, "s", &path); + if (r < 0) + return r; + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path); + + r = sd_bus_message_read_strv(message, &argv); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + ExecCommand *c; + + c = new0(ExecCommand, 1); + if (!c) + return -ENOMEM; + + c->path = strdup(path); + if (!c->path) { + free(c); + return -ENOMEM; + } + + c->argv = TAKE_PTR(argv); + + c->flags = b ? EXEC_COMMAND_IGNORE_FAILURE : 0; + + path_simplify(c->path, false); + exec_command_append_list(exec_command, c); + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + ExecCommand *c; + size_t size = 0; + + if (n == 0) + *exec_command = exec_command_free_list(*exec_command); + + f = open_memstream(&buf, &size); + if (!f) + return -ENOMEM; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + fputs("ExecStart=\n", f); + + LIST_FOREACH(command, c, *exec_command) { + _cleanup_free_ char *a = NULL, *t = NULL; + const char *p; + + p = unit_escape_setting(c->path, UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS, &t); + if (!p) + return -ENOMEM; + + a = unit_concat_strv(c->argv, UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS); + if (!a) + return -ENOMEM; + + fprintf(f, "%s=%s@%s %s\n", + name, + c->flags & EXEC_COMMAND_IGNORE_FAILURE ? "-" : "", + p, + a); + } + + r = fflush_and_check(f); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; +} + +static int parse_personality(const char *s, unsigned long *p) { + unsigned long v; + + assert(p); + + v = personality_from_string(s); + if (v == PERSONALITY_INVALID) + return -EINVAL; + + *p = v; + return 0; +} + +static const char* mount_propagation_flags_to_string_with_check(unsigned long n) { + if (!IN_SET(n, 0, MS_SHARED, MS_PRIVATE, MS_SLAVE)) + return NULL; + + return mount_propagation_flags_to_string(n); +} + +static BUS_DEFINE_SET_TRANSIENT(nsec, "t", uint64_t, nsec_t, NSEC_FMT); +static BUS_DEFINE_SET_TRANSIENT_IS_VALID(log_level, "i", int32_t, int, "%" PRIi32, log_level_is_valid); +#if HAVE_SECCOMP +static BUS_DEFINE_SET_TRANSIENT_IS_VALID(errno, "i", int32_t, int, "%" PRIi32, errno_is_valid); +#endif +static BUS_DEFINE_SET_TRANSIENT_PARSE(std_input, ExecInput, exec_input_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(std_output, ExecOutput, exec_output_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(capability, "t", uint64_t, uint64_t, "%" PRIu64, capability_set_to_string_alloc); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(namespace_flag, "t", uint64_t, unsigned long, "%" PRIu64, namespace_flags_to_string); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(mount_flags, "t", uint64_t, unsigned long, "%" PRIu64, mount_propagation_flags_to_string_with_check); + +int bus_exec_context_set_transient_property( + Unit *u, + ExecContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *suffix; + int r; + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "User")) + return bus_set_transient_user(u, name, &c->user, message, flags, error); + + if (streq(name, "Group")) + return bus_set_transient_user(u, name, &c->group, message, flags, error); + + if (streq(name, "TTYPath")) + return bus_set_transient_path(u, name, &c->tty_path, message, flags, error); + + if (streq(name, "RootImage")) + return bus_set_transient_path(u, name, &c->root_image, message, flags, error); + + if (streq(name, "RootDirectory")) + return bus_set_transient_path(u, name, &c->root_directory, message, flags, error); + + if (streq(name, "SyslogIdentifier")) + return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error); + + if (streq(name, "LogLevelMax")) + return bus_set_transient_log_level(u, name, &c->log_level_max, message, flags, error); + + if (streq(name, "LogRateLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &c->log_rate_limit_interval_usec, message, flags, error); + + if (streq(name, "LogRateLimitBurst")) + return bus_set_transient_unsigned(u, name, &c->log_rate_limit_burst, message, flags, error); + + if (streq(name, "Personality")) + return bus_set_transient_personality(u, name, &c->personality, message, flags, error); + + if (streq(name, "StandardInput")) + return bus_set_transient_std_input(u, name, &c->std_input, message, flags, error); + + if (streq(name, "StandardOutput")) + return bus_set_transient_std_output(u, name, &c->std_output, message, flags, error); + + if (streq(name, "StandardError")) + return bus_set_transient_std_output(u, name, &c->std_error, message, flags, error); + + if (streq(name, "IgnoreSIGPIPE")) + return bus_set_transient_bool(u, name, &c->ignore_sigpipe, message, flags, error); + + if (streq(name, "TTYVHangup")) + return bus_set_transient_bool(u, name, &c->tty_vhangup, message, flags, error); + + if (streq(name, "TTYReset")) + return bus_set_transient_bool(u, name, &c->tty_reset, message, flags, error); + + if (streq(name, "TTYVTDisallocate")) + return bus_set_transient_bool(u, name, &c->tty_vt_disallocate, message, flags, error); + + if (streq(name, "PrivateTmp")) + return bus_set_transient_bool(u, name, &c->private_tmp, message, flags, error); + + if (streq(name, "PrivateDevices")) + return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error); + + if (streq(name, "PrivateMounts")) + return bus_set_transient_bool(u, name, &c->private_mounts, message, flags, error); + + if (streq(name, "PrivateNetwork")) + return bus_set_transient_bool(u, name, &c->private_network, message, flags, error); + + if (streq(name, "PrivateUsers")) + return bus_set_transient_bool(u, name, &c->private_users, message, flags, error); + + if (streq(name, "NoNewPrivileges")) + return bus_set_transient_bool(u, name, &c->no_new_privileges, message, flags, error); + + if (streq(name, "SyslogLevelPrefix")) + return bus_set_transient_bool(u, name, &c->syslog_level_prefix, message, flags, error); + + if (streq(name, "MemoryDenyWriteExecute")) + return bus_set_transient_bool(u, name, &c->memory_deny_write_execute, message, flags, error); + + if (streq(name, "RestrictRealtime")) + return bus_set_transient_bool(u, name, &c->restrict_realtime, message, flags, error); + + if (streq(name, "DynamicUser")) + return bus_set_transient_bool(u, name, &c->dynamic_user, message, flags, error); + + if (streq(name, "RemoveIPC")) + return bus_set_transient_bool(u, name, &c->remove_ipc, message, flags, error); + + if (streq(name, "ProtectKernelTunables")) + return bus_set_transient_bool(u, name, &c->protect_kernel_tunables, message, flags, error); + + if (streq(name, "ProtectKernelModules")) + return bus_set_transient_bool(u, name, &c->protect_kernel_modules, message, flags, error); + + if (streq(name, "ProtectControlGroups")) + return bus_set_transient_bool(u, name, &c->protect_control_groups, message, flags, error); + + if (streq(name, "MountAPIVFS")) + return bus_set_transient_bool(u, name, &c->mount_apivfs, message, flags, error); + + if (streq(name, "CPUSchedulingResetOnFork")) + return bus_set_transient_bool(u, name, &c->cpu_sched_reset_on_fork, message, flags, error); + + if (streq(name, "NonBlocking")) + return bus_set_transient_bool(u, name, &c->non_blocking, message, flags, error); + + if (streq(name, "LockPersonality")) + return bus_set_transient_bool(u, name, &c->lock_personality, message, flags, error); + + if (streq(name, "UtmpIdentifier")) + return bus_set_transient_string(u, name, &c->utmp_id, message, flags, error); + + if (streq(name, "UtmpMode")) + return bus_set_transient_utmp_mode(u, name, &c->utmp_mode, message, flags, error); + + if (streq(name, "PAMName")) + return bus_set_transient_string(u, name, &c->pam_name, message, flags, error); + + if (streq(name, "TimerSlackNSec")) + return bus_set_transient_nsec(u, name, &c->timer_slack_nsec, message, flags, error); + + if (streq(name, "ProtectSystem")) + return bus_set_transient_protect_system(u, name, &c->protect_system, message, flags, error); + + if (streq(name, "ProtectHome")) + return bus_set_transient_protect_home(u, name, &c->protect_home, message, flags, error); + + if (streq(name, "KeyringMode")) + return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error); + + if (streq(name, "RuntimeDirectoryPreserve")) + return bus_set_transient_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error); + + if (streq(name, "UMask")) + return bus_set_transient_mode_t(u, name, &c->umask, message, flags, error); + + if (streq(name, "RuntimeDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_RUNTIME].mode, message, flags, error); + + if (streq(name, "StateDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_STATE].mode, message, flags, error); + + if (streq(name, "CacheDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CACHE].mode, message, flags, error); + + if (streq(name, "LogsDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_LOGS].mode, message, flags, error); + + if (streq(name, "ConfigurationDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CONFIGURATION].mode, message, flags, error); + + if (streq(name, "SELinuxContext")) + return bus_set_transient_string(u, name, &c->selinux_context, message, flags, error); + + if (streq(name, "SecureBits")) + return bus_set_transient_secure_bits(u, name, &c->secure_bits, message, flags, error); + + if (streq(name, "CapabilityBoundingSet")) + return bus_set_transient_capability(u, name, &c->capability_bounding_set, message, flags, error); + + if (streq(name, "AmbientCapabilities")) + return bus_set_transient_capability(u, name, &c->capability_ambient_set, message, flags, error); + + if (streq(name, "RestrictNamespaces")) + return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error); + + if (streq(name, "MountFlags")) + return bus_set_transient_mount_flags(u, name, &c->mount_flags, message, flags, error); + + if (streq(name, "SupplementaryGroups")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + if (!isempty(*p) && !valid_user_group_name_or_id(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid supplementary group names"); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->supplementary_groups = strv_free(c->supplementary_groups); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&c->supplementary_groups, l, true); + if (r < 0) + return -ENOMEM; + + joined = strv_join(c->supplementary_groups, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (streq(name, "SyslogLevel")) { + int32_t level; + + r = sd_bus_message_read(message, "i", &level); + if (r < 0) + return r; + + if (!log_level_is_valid(level)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Log level value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->syslog_priority = (c->syslog_priority & LOG_FACMASK) | level; + unit_write_settingf(u, flags, name, "SyslogLevel=%i", level); + } + + return 1; + + } else if (streq(name, "SyslogFacility")) { + int32_t facility; + + r = sd_bus_message_read(message, "i", &facility); + if (r < 0) + return r; + + if (!log_facility_unshifted_is_valid(facility)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Log facility value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->syslog_priority = (facility << 3) | LOG_PRI(c->syslog_priority); + unit_write_settingf(u, flags, name, "SyslogFacility=%i", facility); + } + + return 1; + + } else if (streq(name, "LogExtraFields")) { + size_t n = 0; + + r = sd_bus_message_enter_container(message, 'a', "ay"); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ void *copy = NULL; + struct iovec *t; + const char *eq; + const void *p; + size_t sz; + + /* Note that we expect a byte array for each field, instead of a string. That's because on the + * lower-level journal fields can actually contain binary data and are not restricted to text, + * and we should not "lose precision" in our types on the way. That said, I am pretty sure + * actually encoding binary data as unit metadata is not a good idea. Hence we actually refuse + * any actual binary data, and only accept UTF-8. This allows us to eventually lift this + * limitation, should a good, valid usecase arise. */ + + r = sd_bus_message_read_array(message, 'y', &p, &sz); + if (r < 0) + return r; + if (r == 0) + break; + + if (memchr(p, 0, sz)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains zero byte"); + + eq = memchr(p, '=', sz); + if (!eq) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains no '=' character"); + if (!journal_field_valid(p, eq - (const char*) p, false)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field invalid"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec)); + if (!t) + return -ENOMEM; + c->log_extra_fields = t; + } + + copy = malloc(sz + 1); + if (!copy) + return -ENOMEM; + + memcpy(copy, p, sz); + ((uint8_t*) copy)[sz] = 0; + + if (!utf8_is_valid(copy)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field is not valid UTF-8"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE(copy, sz); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C, name, "LogExtraFields=%s", (char*) copy); + + copy = NULL; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && n == 0) { + exec_context_free_log_extra_fields(c); + unit_write_setting(u, flags, name, "LogExtraFields="); + } + + return 1; + } + +#if HAVE_SECCOMP + + if (streq(name, "SystemCallErrorNumber")) + return bus_set_transient_errno(u, name, &c->syscall_errno, message, flags, error); + + if (streq(name, "SystemCallFilter")) { + int whitelist; + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &whitelist); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + bool invert = !whitelist; + char **s; + + if (strv_isempty(l)) { + c->syscall_whitelist = false; + c->syscall_filter = hashmap_free(c->syscall_filter); + + unit_write_settingf(u, flags, name, "SystemCallFilter="); + return 1; + } + + if (!c->syscall_filter) { + c->syscall_filter = hashmap_new(NULL); + if (!c->syscall_filter) + return log_oom(); + + c->syscall_whitelist = whitelist; + + if (c->syscall_whitelist) { + r = seccomp_parse_syscall_filter("@default", -1, c->syscall_filter, SECCOMP_PARSE_WHITELIST | (invert ? SECCOMP_PARSE_INVERT : 0)); + if (r < 0) + return r; + } + } + + STRV_FOREACH(s, l) { + _cleanup_free_ char *n = NULL; + int e; + + r = parse_syscall_and_errno(*s, &n, &e); + if (r < 0) + return r; + + r = seccomp_parse_syscall_filter(n, e, c->syscall_filter, (invert ? SECCOMP_PARSE_INVERT : 0) | (c->syscall_whitelist ? SECCOMP_PARSE_WHITELIST : 0)); + if (r < 0) + return r; + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "SystemCallFilter=%s%s", whitelist ? "" : "~", joined); + } + + return 1; + + } else if (streq(name, "SystemCallArchitectures")) { + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + + if (strv_isempty(l)) + c->syscall_archs = set_free(c->syscall_archs); + else { + char **s; + + r = set_ensure_allocated(&c->syscall_archs, NULL); + if (r < 0) + return r; + + STRV_FOREACH(s, l) { + uint32_t a; + + r = seccomp_arch_from_string(*s, &a); + if (r < 0) + return r; + + r = set_put(c->syscall_archs, UINT32_TO_PTR(a + 1)); + if (r < 0) + return r; + } + + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s", name, joined); + } + + return 1; + + } else if (streq(name, "RestrictAddressFamilies")) { + int whitelist; + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &whitelist); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + bool invert = !whitelist; + char **s; + + if (strv_isempty(l)) { + c->address_families_whitelist = false; + c->address_families = set_free(c->address_families); + + unit_write_settingf(u, flags, name, "RestrictAddressFamilies="); + return 1; + } + + if (!c->address_families) { + c->address_families = set_new(NULL); + if (!c->address_families) + return log_oom(); + + c->address_families_whitelist = whitelist; + } + + STRV_FOREACH(s, l) { + int af; + + af = af_from_name(*s); + if (af < 0) + return af; + + if (!invert == c->address_families_whitelist) { + r = set_put(c->address_families, INT_TO_PTR(af)); + if (r < 0) + return r; + } else + (void) set_remove(c->address_families, INT_TO_PTR(af)); + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s%s", whitelist ? "" : "~", joined); + } + + return 1; + } +#endif + if (streq(name, "CPUAffinity")) { + const void *a; + size_t n = 0; + + r = sd_bus_message_read_array(message, 'y', &a, &n); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (n == 0) { + c->cpuset = cpu_set_mfree(c->cpuset); + c->cpuset_ncpus = 0; + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *str = NULL; + size_t allocated = 0, len = 0, i, ncpus; + + ncpus = CPU_SIZE_TO_NUM(n); + + for (i = 0; i < ncpus; i++) { + _cleanup_free_ char *p = NULL; + size_t add; + + if (!CPU_ISSET_S(i, n, (cpu_set_t*) a)) + continue; + + r = asprintf(&p, "%zu", i); + if (r < 0) + return -ENOMEM; + + add = strlen(p); + + if (!GREEDY_REALLOC(str, allocated, len + add + 2)) + return -ENOMEM; + + strcpy(mempcpy(str + len, p, add), " "); + len += add + 1; + } + + if (len != 0) + str[len - 1] = '\0'; + + if (!c->cpuset || c->cpuset_ncpus < ncpus) { + cpu_set_t *cpuset; + + cpuset = CPU_ALLOC(ncpus); + if (!cpuset) + return -ENOMEM; + + CPU_ZERO_S(n, cpuset); + if (c->cpuset) { + CPU_OR_S(CPU_ALLOC_SIZE(c->cpuset_ncpus), cpuset, c->cpuset, (cpu_set_t*) a); + CPU_FREE(c->cpuset); + } else + CPU_OR_S(n, cpuset, cpuset, (cpu_set_t*) a); + + c->cpuset = cpuset; + c->cpuset_ncpus = ncpus; + } else + CPU_OR_S(n, c->cpuset, c->cpuset, (cpu_set_t*) a); + + unit_write_settingf(u, flags, name, "%s=%s", name, str); + } + } + + return 1; + + } else if (streq(name, "Nice")) { + int32_t q; + + r = sd_bus_message_read(message, "i", &q); + if (r < 0) + return r; + + if (!nice_is_valid(q)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid Nice value: %i", q); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->nice = q; + c->nice_set = true; + + unit_write_settingf(u, flags, name, "Nice=%i", q); + } + + return 1; + + } else if (streq(name, "CPUSchedulingPolicy")) { + int32_t q; + + r = sd_bus_message_read(message, "i", &q); + if (r < 0) + return r; + + if (!sched_policy_is_valid(q)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling policy: %i", q); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *s = NULL; + + r = sched_policy_to_string_alloc(q, &s); + if (r < 0) + return r; + + c->cpu_sched_policy = q; + c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(q), sched_get_priority_max(q)); + c->cpu_sched_set = true; + + unit_write_settingf(u, flags, name, "CPUSchedulingPolicy=%s", s); + } + + return 1; + + } else if (streq(name, "CPUSchedulingPriority")) { + int32_t p, min, max; + + r = sd_bus_message_read(message, "i", &p); + if (r < 0) + return r; + + min = sched_get_priority_min(c->cpu_sched_policy); + max = sched_get_priority_max(c->cpu_sched_policy); + if (p < min || p > max) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling priority: %i", p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_sched_priority = p; + c->cpu_sched_set = true; + + unit_write_settingf(u, flags, name, "CPUSchedulingPriority=%i", p); + } + + return 1; + + } else if (streq(name, "IOSchedulingClass")) { + int32_t q; + + r = sd_bus_message_read(message, "i", &q); + if (r < 0) + return r; + + if (!ioprio_class_is_valid(q)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling class: %i", q); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *s = NULL; + + r = ioprio_class_to_string_alloc(q, &s); + if (r < 0) + return r; + + c->ioprio = IOPRIO_PRIO_VALUE(q, IOPRIO_PRIO_DATA(c->ioprio)); + c->ioprio_set = true; + + unit_write_settingf(u, flags, name, "IOSchedulingClass=%s", s); + } + + return 1; + + } else if (streq(name, "IOSchedulingPriority")) { + int32_t p; + + r = sd_bus_message_read(message, "i", &p); + if (r < 0) + return r; + + if (!ioprio_priority_is_valid(p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling priority: %i", p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_PRIO_CLASS(c->ioprio), p); + c->ioprio_set = true; + + unit_write_settingf(u, flags, name, "IOSchedulingPriority=%i", p); + } + + return 1; + + } else if (streq(name, "WorkingDirectory")) { + const char *s; + bool missing_ok; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (s[0] == '-') { + missing_ok = true; + s++; + } else + missing_ok = false; + + if (!isempty(s) && !streq(s, "~") && !path_is_absolute(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "WorkingDirectory= expects an absolute path or '~'"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (streq(s, "~")) { + c->working_directory = mfree(c->working_directory); + c->working_directory_home = true; + } else { + r = free_and_strdup(&c->working_directory, empty_to_null(s)); + if (r < 0) + return r; + + c->working_directory_home = false; + } + + c->working_directory_missing_ok = missing_ok; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s); + } + + return 1; + + } else if (STR_IN_SET(name, + "StandardInputFileDescriptorName", "StandardOutputFileDescriptorName", "StandardErrorFileDescriptorName")) { + const char *s; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!isempty(s) && !fdname_is_valid(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid file descriptor name"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + + if (streq(name, "StandardInputFileDescriptorName")) { + r = free_and_strdup(c->stdio_fdname + STDIN_FILENO, empty_to_null(s)); + if (r < 0) + return r; + + c->std_input = EXEC_INPUT_NAMED_FD; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=fd:%s", exec_context_fdname(c, STDIN_FILENO)); + + } else if (streq(name, "StandardOutputFileDescriptorName")) { + r = free_and_strdup(c->stdio_fdname + STDOUT_FILENO, empty_to_null(s)); + if (r < 0) + return r; + + c->std_output = EXEC_OUTPUT_NAMED_FD; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=fd:%s", exec_context_fdname(c, STDOUT_FILENO)); + + } else { + assert(streq(name, "StandardErrorFileDescriptorName")); + + r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + c->std_error = EXEC_OUTPUT_NAMED_FD; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=fd:%s", exec_context_fdname(c, STDERR_FILENO)); + } + } + + return 1; + + } else if (STR_IN_SET(name, + "StandardInputFile", + "StandardOutputFile", "StandardOutputFileToAppend", + "StandardErrorFile", "StandardErrorFileToAppend")) { + const char *s; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!isempty(s)) { + if (!path_is_absolute(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute", s); + if (!path_is_normalized(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not normalized", s); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + + if (streq(name, "StandardInputFile")) { + r = free_and_strdup(&c->stdio_file[STDIN_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + c->std_input = EXEC_INPUT_FILE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=file:%s", s); + + } else if (STR_IN_SET(name, "StandardOutputFile", "StandardOutputFileToAppend")) { + r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + if (streq(name, "StandardOutputFile")) { + c->std_output = EXEC_OUTPUT_FILE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=file:%s", s); + } else { + assert(streq(name, "StandardOutputFileToAppend")); + c->std_output = EXEC_OUTPUT_FILE_APPEND; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=append:%s", s); + } + } else { + assert(STR_IN_SET(name, "StandardErrorFile", "StandardErrorFileToAppend")); + + r = free_and_strdup(&c->stdio_file[STDERR_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + if (streq(name, "StandardErrorFile")) { + c->std_error = EXEC_OUTPUT_FILE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=file:%s", s); + } else { + assert(streq(name, "StandardErrorFileToAppend")); + c->std_error = EXEC_OUTPUT_FILE_APPEND; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=append:%s", s); + } + } + } + + return 1; + + } else if (streq(name, "StandardInputData")) { + const void *p; + size_t sz; + + r = sd_bus_message_read_array(message, 'y', &p, &sz); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *encoded = NULL; + + if (sz == 0) { + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + + unit_write_settingf(u, flags, name, "StandardInputData="); + } else { + void *q; + ssize_t n; + + if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */ + c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX) + return -E2BIG; + + n = base64mem(p, sz, &encoded); + if (n < 0) + return (int) n; + + q = realloc(c->stdin_data, c->stdin_data_size + sz); + if (!q) + return -ENOMEM; + + memcpy((uint8_t*) q + c->stdin_data_size, p, sz); + + c->stdin_data = q; + c->stdin_data_size += sz; + + unit_write_settingf(u, flags, name, "StandardInputData=%s", encoded); + } + } + + return 1; + + } else if (streq(name, "Environment")) { + + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!strv_env_is_valid(l)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment block."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->environment = strv_free(c->environment); + unit_write_setting(u, flags, name, "Environment="); + } else { + _cleanup_free_ char *joined = NULL; + char **e; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C); + if (!joined) + return -ENOMEM; + + e = strv_env_merge(2, c->environment, l); + if (!e) + return -ENOMEM; + + strv_free_and_replace(c->environment, e); + unit_write_settingf(u, flags, name, "Environment=%s", joined); + } + } + + return 1; + + } else if (streq(name, "UnsetEnvironment")) { + + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!strv_env_name_or_assignment_is_valid(l)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid UnsetEnvironment= list."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->unset_environment = strv_free(c->unset_environment); + unit_write_setting(u, flags, name, "UnsetEnvironment="); + } else { + _cleanup_free_ char *joined = NULL; + char **e; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C); + if (!joined) + return -ENOMEM; + + e = strv_env_merge(2, c->unset_environment, l); + if (!e) + return -ENOMEM; + + strv_free_and_replace(c->unset_environment, e); + unit_write_settingf(u, flags, name, "UnsetEnvironment=%s", joined); + } + } + + return 1; + + } else if (streq(name, "OOMScoreAdjust")) { + int oa; + + r = sd_bus_message_read(message, "i", &oa); + if (r < 0) + return r; + + if (!oom_score_adjust_is_valid(oa)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "OOM score adjust value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->oom_score_adjust = oa; + c->oom_score_adjust_set = true; + unit_write_settingf(u, flags, name, "OOMScoreAdjust=%i", oa); + } + + return 1; + + } else if (streq(name, "EnvironmentFiles")) { + + _cleanup_free_ char *joined = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **l = NULL; + size_t size = 0; + char **i; + + r = sd_bus_message_enter_container(message, 'a', "(sb)"); + if (r < 0) + return r; + + f = open_memstream(&joined, &size); + if (!f) + return -ENOMEM; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + fputs("EnvironmentFile=\n", f); + + STRV_FOREACH(i, c->environment_files) { + _cleanup_free_ char *q = NULL; + + q = specifier_escape(*i); + if (!q) + return -ENOMEM; + + fprintf(f, "EnvironmentFile=%s\n", q); + } + + while ((r = sd_bus_message_enter_container(message, 'r', "sb")) > 0) { + const char *path; + int b; + + r = sd_bus_message_read(message, "sb", &path, &b); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *q = NULL; + char *buf; + + buf = strjoin(b ? "-" : "", path); + if (!buf) + return -ENOMEM; + + q = specifier_escape(buf); + if (!q) { + free(buf); + return -ENOMEM; + } + + fprintf(f, "EnvironmentFile=%s\n", q); + + r = strv_consume(&l, buf); + if (r < 0) + return r; + } + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = fflush_and_check(f); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->environment_files = strv_free(c->environment_files); + unit_write_setting(u, flags, name, "EnvironmentFile="); + } else { + r = strv_extend_strv(&c->environment_files, l, true); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, joined); + } + } + + return 1; + + } else if (streq(name, "PassEnvironment")) { + + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!strv_env_name_is_valid(l)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PassEnvironment= block."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->pass_environment = strv_free(c->pass_environment); + unit_write_setting(u, flags, name, "PassEnvironment="); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&c->pass_environment, l, true); + if (r < 0) + return r; + + /* We write just the new settings out to file, with unresolved specifiers. */ + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "PassEnvironment=%s", joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "ReadWriteDirectories", "ReadOnlyDirectories", "InaccessibleDirectories", + "ReadWritePaths", "ReadOnlyPaths", "InaccessiblePaths")) { + _cleanup_strv_free_ char **l = NULL; + char ***dirs; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + char *i = *p; + size_t offset; + + offset = i[0] == '-'; + offset += i[offset] == '+'; + if (!path_is_absolute(i + offset)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name); + + path_simplify(i + offset, false); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (STR_IN_SET(name, "ReadWriteDirectories", "ReadWritePaths")) + dirs = &c->read_write_paths; + else if (STR_IN_SET(name, "ReadOnlyDirectories", "ReadOnlyPaths")) + dirs = &c->read_only_paths; + else /* "InaccessiblePaths" */ + dirs = &c->inaccessible_paths; + + if (strv_isempty(l)) { + *dirs = strv_free(*dirs); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS); + if (!joined) + return -ENOMEM; + + r = strv_extend_strv(dirs, l, true); + if (r < 0) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "RuntimeDirectory", "StateDirectory", "CacheDirectory", "LogsDirectory", "ConfigurationDirectory")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + if (!path_is_normalized(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is not normalized: %s", name, *p); + + if (path_is_absolute(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is absolute: %s", name, *p); + + if (path_startswith(*p, "private")) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path can't be 'private': %s", name, *p); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + char ***dirs = NULL; + ExecDirectoryType i; + + for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) + if (streq(name, exec_directory_type_to_string(i))) { + dirs = &c->directories[i].paths; + break; + } + + assert(dirs); + + if (strv_isempty(l)) { + *dirs = strv_free(*dirs); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(dirs, l, true); + if (r < 0) + return -ENOMEM; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "AppArmorProfile", "SmackProcessLabel")) { + int ignore; + const char *s; + + r = sd_bus_message_read(message, "(bs)", &ignore, &s); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + char **p; + bool *b; + + if (streq(name, "AppArmorProfile")) { + p = &c->apparmor_profile; + b = &c->apparmor_profile_ignore; + } else { /* "SmackProcessLabel" */ + p = &c->smack_process_label; + b = &c->smack_process_label_ignore; + } + + if (isempty(s)) { + *p = mfree(*p); + *b = false; + } else { + if (free_and_strdup(p, s) < 0) + return -ENOMEM; + *b = ignore; + } + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s%s", name, ignore ? "-" : "", strempty(s)); + } + + return 1; + + } else if (STR_IN_SET(name, "BindPaths", "BindReadOnlyPaths")) { + const char *source, *destination; + int ignore_enoent; + uint64_t mount_flags; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ssbt)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ssbt)", &source, &destination, &ignore_enoent, &mount_flags)) > 0) { + + if (!path_is_absolute(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source); + if (!path_is_absolute(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination); + if (!IN_SET(mount_flags, 0, MS_REC)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown mount flags."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts, + &(BindMount) { + .source = strdup(source), + .destination = strdup(destination), + .read_only = !!strstr(name, "ReadOnly"), + .recursive = !!(mount_flags & MS_REC), + .ignore_enoent = ignore_enoent, + }); + if (r < 0) + return r; + + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s%s:%s:%s", + name, + ignore_enoent ? "-" : "", + source, + destination, + (mount_flags & MS_REC) ? "rbind" : "norbind"); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (empty) { + bind_mount_free_many(c->bind_mounts, c->n_bind_mounts); + c->bind_mounts = NULL; + c->n_bind_mounts = 0; + + unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if (streq(name, "TemporaryFileSystem")) { + const char *path, *options; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &path, &options)) > 0) { + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Mount point %s is not absolute.", path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, path, options); + if (r < 0) + return r; + + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s:%s", + name, + path, + options); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (empty) { + temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems); + c->temporary_filesystems = NULL; + c->n_temporary_filesystems = 0; + + unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if ((suffix = startswith(name, "Limit"))) { + const char *soft = NULL; + int ri; + + ri = rlimit_from_string(suffix); + if (ri < 0) { + soft = endswith(suffix, "Soft"); + if (soft) { + const char *n; + + n = strndupa(suffix, soft - suffix); + ri = rlimit_from_string(n); + if (ri >= 0) + name = strjoina("Limit", n); + } + } + + if (ri >= 0) { + uint64_t rl; + rlim_t x; + + r = sd_bus_message_read(message, "t", &rl); + if (r < 0) + return r; + + if (rl == (uint64_t) -1) + x = RLIM_INFINITY; + else { + x = (rlim_t) rl; + + if ((uint64_t) x != rl) + return -ERANGE; + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *f = NULL; + struct rlimit nl; + + if (c->rlimit[ri]) { + nl = *c->rlimit[ri]; + + if (soft) + nl.rlim_cur = x; + else + nl.rlim_max = x; + } else + /* When the resource limit is not initialized yet, then assign the value to both fields */ + nl = (struct rlimit) { + .rlim_cur = x, + .rlim_max = x, + }; + + r = rlimit_format(&nl, &f); + if (r < 0) + return r; + + if (c->rlimit[ri]) + *c->rlimit[ri] = nl; + else { + c->rlimit[ri] = newdup(struct rlimit, &nl, 1); + if (!c->rlimit[ri]) + return -ENOMEM; + } + + unit_write_settingf(u, flags, name, "%s=%s", name, f); + } + + return 1; + } + + } + + return 0; +} diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h new file mode 100644 index 0000000..8405170 --- /dev/null +++ b/src/core/dbus-execute.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "execute.h" + +#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \ + BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \ + BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \ + SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \ + SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \ + SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags) + +#define BUS_EXEC_COMMAND_VTABLE(name, offset, flags) \ + SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command, offset, flags) + +#define BUS_EXEC_COMMAND_LIST_VTABLE(name, offset, flags) \ + SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command_list, offset, flags) + +extern const sd_bus_vtable bus_exec_vtable[]; + +int bus_property_get_exec_output(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_exec_command(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_exec_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); + +int bus_exec_context_set_transient_property(Unit *u, ExecContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_exec_command(Unit *u, const char *name, ExecCommand **exec_command, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-job.c b/src/core/dbus-job.c new file mode 100644 index 0000000..d11e58b --- /dev/null +++ b/src/core/dbus-job.c @@ -0,0 +1,312 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "dbus-job.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "job.h" +#include "log.h" +#include "selinux-access.h" +#include "string-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, job_type, JobType); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_state, job_state, JobState); + +static int property_get_unit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + Job *j = userdata; + + assert(bus); + assert(reply); + assert(j); + + p = unit_dbus_path(j->unit); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(so)", j->unit->id, p); +} + +int bus_job_method_cancel(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Job *j = userdata; + int r; + + assert(message); + assert(j); + + r = mac_selinux_unit_access_check(j->unit, message, "stop", error); + if (r < 0) + return r; + + /* Access is granted to the job owner */ + if (!sd_bus_track_contains(j->bus_track, sd_bus_message_get_sender(message))) { + + /* And for everybody else consult polkit */ + r = bus_verify_manage_units_async(j->unit->manager, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + } + + job_finish_and_invalidate(j, JOB_CANCELED, true, false); + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ Job **list = NULL; + Job *j = userdata; + int r, i, n; + + if (strstr(sd_bus_message_get_member(message), "After")) + n = job_get_after(j, &list); + else + n = job_get_before(j, &list); + if (n < 0) + return n; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(usssoo)"); + if (r < 0) + return r; + + for (i = 0; i < n; i ++) { + _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + + job_path = job_dbus_path(list[i]); + if (!job_path) + return -ENOMEM; + + unit_path = unit_dbus_path(list[i]->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(usssoo)", + list[i]->id, + list[i]->unit->id, + job_type_to_string(list[i]->type), + job_state_to_string(list[i]->state), + job_path, + unit_path); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +const sd_bus_vtable bus_job_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_METHOD("Cancel", NULL, NULL, bus_job_method_cancel, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetAfter", NULL, "a(usssoo)", bus_job_method_get_waiting_jobs, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetBefore", NULL, "a(usssoo)", bus_job_method_get_waiting_jobs, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_PROPERTY("Id", "u", NULL, offsetof(Job, id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Unit", "(so)", property_get_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobType", "s", property_get_type, offsetof(Job, type), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("State", "s", property_get_state, offsetof(Job, state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_VTABLE_END +}; + +static int send_new_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Job *j = userdata; + int r; + + assert(bus); + assert(j); + + p = job_dbus_path(j); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "JobNew"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "uos", j->id, p, j->unit->id); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +static int send_changed_signal(sd_bus *bus, void *userdata) { + _cleanup_free_ char *p = NULL; + Job *j = userdata; + + assert(bus); + assert(j); + + p = job_dbus_path(j); + if (!p) + return -ENOMEM; + + return sd_bus_emit_properties_changed(bus, p, "org.freedesktop.systemd1.Job", "State", NULL); +} + +void bus_job_send_change_signal(Job *j) { + int r; + + assert(j); + + /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */ + bus_unit_send_pending_change_signal(j->unit, true); + + if (j->in_dbus_queue) { + LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = false; + } + + r = bus_foreach_bus(j->manager, j->bus_track, j->sent_dbus_new_signal ? send_changed_signal : send_new_signal, j); + if (r < 0) + log_debug_errno(r, "Failed to send job change signal for %u: %m", j->id); + + j->sent_dbus_new_signal = true; +} + +void bus_job_send_pending_change_signal(Job *j, bool including_new) { + assert(j); + + if (!j->in_dbus_queue) + return; + + if (!j->sent_dbus_new_signal && !including_new) + return; + + if (MANAGER_IS_RELOADING(j->unit->manager)) + return; + + bus_job_send_change_signal(j); +} + +static int send_removed_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Job *j = userdata; + int r; + + assert(bus); + assert(j); + + p = job_dbus_path(j); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "JobRemoved"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "uoss", j->id, p, j->unit->id, job_result_to_string(j->result)); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +void bus_job_send_removed_signal(Job *j) { + int r; + + assert(j); + + if (!j->sent_dbus_new_signal) + bus_job_send_change_signal(j); + + /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */ + bus_unit_send_pending_change_signal(j->unit, true); + + r = bus_foreach_bus(j->manager, j->bus_track, send_removed_signal, j); + if (r < 0) + log_debug_errno(r, "Failed to send job remove signal for %u: %m", j->id); +} + +static int bus_job_track_handler(sd_bus_track *t, void *userdata) { + Job *j = userdata; + + assert(t); + assert(j); + + j->bus_track = sd_bus_track_unref(j->bus_track); /* make sure we aren't called again */ + + /* Last client dropped off the bus, maybe we should GC this now? */ + job_add_to_gc_queue(j); + return 0; +} + +static int bus_job_allocate_bus_track(Job *j) { + + assert(j); + + if (j->bus_track) + return 0; + + return sd_bus_track_new(j->unit->manager->api_bus, &j->bus_track, bus_job_track_handler, j); +} + +int bus_job_coldplug_bus_track(Job *j) { + int r = 0; + _cleanup_strv_free_ char **deserialized_clients = NULL; + + assert(j); + + deserialized_clients = TAKE_PTR(j->deserialized_clients); + + if (strv_isempty(deserialized_clients)) + return 0; + + if (!j->manager->api_bus) + return 0; + + r = bus_job_allocate_bus_track(j); + if (r < 0) + return r; + + return bus_track_add_name_many(j->bus_track, deserialized_clients); +} + +int bus_job_track_sender(Job *j, sd_bus_message *m) { + int r; + + assert(j); + assert(m); + + if (sd_bus_message_get_bus(m) != j->unit->manager->api_bus) { + j->ref_by_private_bus = true; + return 0; + } + + r = bus_job_allocate_bus_track(j); + if (r < 0) + return r; + + return sd_bus_track_add_sender(j->bus_track, m); +} diff --git a/src/core/dbus-job.h b/src/core/dbus-job.h new file mode 100644 index 0000000..c9f6fc7 --- /dev/null +++ b/src/core/dbus-job.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "job.h" + +extern const sd_bus_vtable bus_job_vtable[]; + +int bus_job_method_cancel(sd_bus_message *message, void *job, sd_bus_error *error); +int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error); + +void bus_job_send_change_signal(Job *j); +void bus_job_send_pending_change_signal(Job *j, bool including_new); +void bus_job_send_removed_signal(Job *j); + +int bus_job_coldplug_bus_track(Job *j); +int bus_job_track_sender(Job *j, sd_bus_message *m); diff --git a/src/core/dbus-kill.c b/src/core/dbus-kill.c new file mode 100644 index 0000000..e2b3a0d --- /dev/null +++ b/src/core/dbus-kill.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "bus-util.h" +#include "dbus-kill.h" +#include "dbus-util.h" +#include "kill.h" +#include "signal-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_kill_mode, kill_mode, KillMode); + +const sd_bus_vtable bus_kill_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("KillMode", "s", property_get_kill_mode, offsetof(KillContext, kill_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KillSignal", "i", bus_property_get_int, offsetof(KillContext, kill_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FinalKillSignal", "i", bus_property_get_int, offsetof(KillContext, final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SendSIGKILL", "b", bus_property_get_bool, offsetof(KillContext, send_sigkill), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SendSIGHUP", "b", bus_property_get_bool, offsetof(KillContext, send_sighup), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WatchdogSignal", "i", bus_property_get_int, offsetof(KillContext, watchdog_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static BUS_DEFINE_SET_TRANSIENT_PARSE(kill_mode, KillMode, kill_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(final_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(watchdog_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); + +int bus_kill_context_set_transient_property( + Unit *u, + KillContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "KillMode")) + return bus_set_transient_kill_mode(u, name, &c->kill_mode, message, flags, error); + + if (streq(name, "SendSIGHUP")) + return bus_set_transient_bool(u, name, &c->send_sighup, message, flags, error); + + if (streq(name, "SendSIGKILL")) + return bus_set_transient_bool(u, name, &c->send_sigkill, message, flags, error); + + if (streq(name, "KillSignal")) + return bus_set_transient_kill_signal(u, name, &c->kill_signal, message, flags, error); + + if (streq(name, "FinalKillSignal")) + return bus_set_transient_final_kill_signal(u, name, &c->final_kill_signal, message, flags, error); + + if (streq(name, "WatchdogSignal")) + return bus_set_transient_watchdog_signal(u, name, &c->watchdog_signal, message, flags, error); + + return 0; +} diff --git a/src/core/dbus-kill.h b/src/core/dbus-kill.h new file mode 100644 index 0000000..8192e94 --- /dev/null +++ b/src/core/dbus-kill.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "kill.h" +#include "unit.h" + +extern const sd_bus_vtable bus_kill_vtable[]; + +int bus_kill_context_set_transient_property(Unit *u, KillContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c new file mode 100644 index 0000000..88e4c6b --- /dev/null +++ b/src/core/dbus-manager.c @@ -0,0 +1,2718 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <sys/prctl.h> +#include <sys/statvfs.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "architecture.h" +#include "build.h" +#include "bus-common-errors.h" +#include "dbus-execute.h" +#include "dbus-job.h" +#include "dbus-manager.h" +#include "dbus-scope.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "install.h" +#include "log.h" +#include "os-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "selinux-access.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "user-util.h" +#include "virt.h" +#include "watchdog.h" + +/* Require 16MiB free in /run/systemd for reloading/reexecing. After all we need to serialize our state there, and if + * we can't we'll fail badly. */ +#define RELOAD_DISK_SPACE_MIN (UINT64_C(16) * UINT64_C(1024) * UINT64_C(1024)) + +static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) { + return (runtime ? UNIT_FILE_RUNTIME : 0) | + (force ? UNIT_FILE_FORCE : 0); +} + +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_version, "s", GIT_VERSION); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_features, "s", SYSTEMD_FEATURES); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_architecture, "s", architecture_to_string(uname_architecture())); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_log_target, "s", log_target_to_string(log_get_target())); +static BUS_DEFINE_PROPERTY_GET2(property_get_system_state, "s", Manager, manager_state, manager_state_to_string); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_timer_slack_nsec, "t", (uint64_t) prctl(PR_GET_TIMERSLACK)); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_hashmap_size, "u", Hashmap *, hashmap_size); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_set_size, "u", Set *, set_size); + +static int property_get_virtualization( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + int v; + + assert(bus); + assert(reply); + + v = detect_virtualization(); + + /* Make sure to return the empty string when we detect no virtualization, as that is the API. + * + * https://github.com/systemd/systemd/issues/1423 + */ + + return sd_bus_message_append( + reply, "s", + v == VIRTUALIZATION_NONE ? NULL : virtualization_to_string(v)); +} + +static int property_get_tainted( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *s = NULL; + Manager *m = userdata; + + assert(bus); + assert(reply); + assert(m); + + s = manager_taint_string(m); + if (!s) + return log_oom(); + + return sd_bus_message_append(reply, "s", s); +} + +static int property_set_log_target( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + const char *t; + int r; + + assert(bus); + assert(value); + + r = sd_bus_message_read(value, "s", &t); + if (r < 0) + return r; + + if (isempty(t)) + manager_restore_original_log_target(m); + else { + LogTarget target; + + target = log_target_from_string(t); + if (target < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log target '%s'", t); + + manager_override_log_target(m, target); + } + + return 0; +} + +static int property_get_log_level( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *t = NULL; + int r; + + assert(bus); + assert(reply); + + r = log_level_to_string_alloc(log_get_max_level(), &t); + if (r < 0) + return r; + + return sd_bus_message_append(reply, "s", t); +} + +static int property_set_log_level( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + const char *t; + int r; + + assert(bus); + assert(value); + + r = sd_bus_message_read(value, "s", &t); + if (r < 0) + return r; + + if (isempty(t)) + manager_restore_original_log_level(m); + else { + int level; + + level = log_level_from_string(t); + if (level < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log level '%s'", t); + + manager_override_log_level(m, level); + } + + return 0; +} + +static int property_get_progress( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + double d; + + assert(bus); + assert(reply); + assert(m); + + if (MANAGER_IS_FINISHED(m)) + d = 1.0; + else + d = 1.0 - ((double) hashmap_size(m->jobs) / (double) m->n_installed_jobs); + + return sd_bus_message_append(reply, "d", d); +} + +static int property_get_environment( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + int r; + + assert(bus); + assert(reply); + assert(m); + + r = manager_get_effective_environment(m, &l); + if (r < 0) + return r; + + return sd_bus_message_append_strv(reply, l); +} + +static int property_get_show_status( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + int b; + + assert(bus); + assert(reply); + assert(m); + + b = IN_SET(m->show_status, SHOW_STATUS_TEMPORARY, SHOW_STATUS_YES); + return sd_bus_message_append_basic(reply, 'b', &b); +} + +static int property_set_runtime_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + usec_t *t = userdata; + int r; + + assert(bus); + assert(value); + + assert_cc(sizeof(usec_t) == sizeof(uint64_t)); + + r = sd_bus_message_read(value, "t", t); + if (r < 0) + return r; + + return watchdog_set_timeout(t); +} + +static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) { + Unit *u; + int r; + + assert(m); + assert(message); + assert(ret_unit); + + /* More or less a wrapper around manager_get_unit() that generates nice errors and has one trick up its sleeve: + * if the name is specified empty we use the client's unit. */ + + if (isempty(name)) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit."); + } else { + u = manager_get_unit(m, name); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", name); + } + + *ret_unit = u; + return 0; +} + +static int bus_load_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) { + assert(m); + assert(message); + assert(ret_unit); + + /* Pretty much the same as bus_get_unit_by_name(), but we also load the unit if necessary. */ + + if (isempty(name)) + return bus_get_unit_by_name(m, message, name, ret_unit, error); + + return manager_load_unit(m, name, NULL, error, ret_unit); +} + +static int method_get_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + path = unit_dbus_path(u); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = userdata; + pid_t pid; + Unit *u; + int r; + + assert(message); + assert(m); + + assert_cc(sizeof(pid_t) == sizeof(uint32_t)); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "u", &pid); + if (r < 0) + return r; + if (pid < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pid); + + if (pid == 0) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pid); + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + path = unit_dbus_path(u); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_get_unit_by_invocation_id(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = userdata; + sd_id128_t id; + const void *a; + Unit *u; + size_t sz; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = sd_bus_message_read_array(message, 'y', &a, &sz); + if (r < 0) + return r; + if (sz == 0) + id = SD_ID128_NULL; + else if (sz == 16) + memcpy(&id, a, sz); + else + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid invocation ID"); + + if (sd_id128_is_null(id)) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Client " PID_FMT " not member of any unit.", pid); + } else { + u = hashmap_get(m->units_by_invocation_id, &id); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", SD_ID128_FORMAT_VAL(id)); + } + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + /* So here's a special trick: the bus path we return actually references the unit by its invocation ID instead + * of the unit name. This means it stays valid only as long as the invocation ID stays the same. */ + path = unit_dbus_path_invocation_id(u); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_get_unit_by_control_group(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = userdata; + const char *cgroup; + Unit *u; + int r; + + r = sd_bus_message_read(message, "s", &cgroup); + if (r < 0) + return r; + + u = manager_get_unit_by_cgroup(m, cgroup); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Control group '%s' is not valid or not managed by this instance", cgroup); + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + path = unit_dbus_path(u); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_load_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_load_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + path = unit_dbus_path(u); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_start_unit_generic(sd_bus_message *message, Manager *m, JobType job_type, bool reload_if_possible, sd_bus_error *error) { + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_load_unit(m, name, NULL, error, &u); + if (r < 0) + return r; + + return bus_unit_method_start_generic(message, u, job_type, reload_if_possible, error); +} + +static int method_start_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_START, false, error); +} + +static int method_stop_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_STOP, false, error); +} + +static int method_reload_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_RELOAD, false, error); +} + +static int method_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_RESTART, false, error); +} + +static int method_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, false, error); +} + +static int method_reload_or_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_RESTART, true, error); +} + +static int method_reload_or_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, true, error); +} + +static int method_start_unit_replace(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *old_name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &old_name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, old_name, &u, error); + if (r < 0) + return r; + if (!u->job || u->job->type != JOB_START) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "No job queued for unit %s", old_name); + + return method_start_unit_generic(message, m, JOB_START, false, error); +} + +static int method_kill_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + return bus_unit_method_kill(message, u, error); +} + +static int method_reset_failed_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + return bus_unit_method_reset_failed(message, u, error); +} + +static int method_set_unit_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_load_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + r = bus_unit_validate_load_state(u, error); + if (r < 0) + return r; + + return bus_unit_method_set_properties(message, u, error); +} + +static int method_ref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_load_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + r = bus_unit_validate_load_state(u, error); + if (r < 0) + return r; + + return bus_unit_method_ref(message, u, error); +} + +static int method_unref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_load_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + r = bus_unit_validate_load_state(u, error); + if (r < 0) + return r; + + return bus_unit_method_unref(message, u, error); +} + +static int reply_unit_info(sd_bus_message *reply, Unit *u) { + _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + Unit *following; + + following = unit_following(u); + + unit_path = unit_dbus_path(u); + if (!unit_path) + return -ENOMEM; + + if (u->job) { + job_path = job_dbus_path(u->job); + if (!job_path) + return -ENOMEM; + } + + return sd_bus_message_append( + reply, "(ssssssouso)", + u->id, + unit_description(u), + unit_load_state_to_string(u->load_state), + unit_active_state_to_string(unit_active_state(u)), + unit_sub_state_to_string(u), + following ? following->id : "", + unit_path, + u->job ? u->job->id : 0, + u->job ? job_type_to_string(u->job->type) : "", + job_path ? job_path : "/"); +} + +static int method_list_units_by_names(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + int r; + char **unit; + _cleanup_strv_free_ char **units = NULL; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &units); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)"); + if (r < 0) + return r; + + STRV_FOREACH(unit, units) { + Unit *u; + + if (!unit_name_is_valid(*unit, UNIT_NAME_ANY)) + continue; + + r = bus_load_unit_by_name(m, message, *unit, &u, error); + if (r < 0) + return r; + + r = reply_unit_info(reply, u); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + return bus_unit_method_get_processes(message, u, error); +} + +static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + return bus_unit_method_attach_processes(message, u, error); +} + +static int transient_unit_from_message( + Manager *m, + sd_bus_message *message, + const char *name, + Unit **unit, + sd_bus_error *error) { + + UnitType t; + Unit *u; + int r; + + assert(m); + assert(message); + assert(name); + + t = unit_name_to_type(name); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name or type."); + + if (!unit_vtable[t]->can_transient) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit type %s does not support transient units.", unit_type_to_string(t)); + + r = manager_load_unit(m, name, NULL, error, &u); + if (r < 0) + return r; + + if (!unit_is_pristine(u)) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, "Unit %s already exists.", name); + + /* OK, the unit failed to load and is unreferenced, now let's + * fill in the transient data instead */ + r = unit_make_transient(u); + if (r < 0) + return r; + + /* Set our properties */ + r = bus_unit_set_properties(u, message, UNIT_RUNTIME, false, error); + if (r < 0) + return r; + + /* If the client asked for it, automatically add a reference to this unit. */ + if (u->bus_track_add) { + r = bus_unit_track_add_sender(u, message); + if (r < 0) + return log_error_errno(r, "Failed to watch sender: %m"); + } + + /* Now load the missing bits of the unit we just created */ + unit_add_to_load_queue(u); + manager_dispatch_load_queue(m); + + *unit = u; + + return 0; +} + +static int transient_aux_units_from_message( + Manager *m, + sd_bus_message *message, + sd_bus_error *error) { + + int r; + + assert(m); + assert(message); + + r = sd_bus_message_enter_container(message, 'a', "(sa(sv))"); + if (r < 0) + return r; + + while ((r = sd_bus_message_enter_container(message, 'r', "sa(sv)")) > 0) { + const char *name = NULL; + Unit *u; + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = transient_unit_from_message(m, message, name, &u, error); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return 0; +} + +static int method_start_transient_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *name, *smode; + Manager *m = userdata; + JobMode mode; + Unit *u; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ss", &name, &smode); + if (r < 0) + return r; + + mode = job_mode_from_string(smode); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s is invalid.", smode); + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = transient_unit_from_message(m, message, name, &u, error); + if (r < 0) + return r; + + r = transient_aux_units_from_message(m, message, error); + if (r < 0) + return r; + + /* Finally, start it */ + return bus_unit_queue_job(message, u, JOB_START, mode, false, error); +} + +static int method_get_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = userdata; + uint32_t id; + Job *j; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "u", &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id); + + r = mac_selinux_unit_access_check(j->unit, message, "status", error); + if (r < 0) + return r; + + path = job_dbus_path(j); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_cancel_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + uint32_t id; + Job *j; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "u", &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id); + + return bus_job_method_cancel(message, j, error); +} + +static int method_clear_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + manager_clear_jobs(m); + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + manager_reset_failed(m); + + return sd_bus_reply_method_return(message, NULL); +} + +static int list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + const char *k; + Iterator i; + Unit *u; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)"); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(u, k, m->units, i) { + if (k != u->id) + continue; + + if (!strv_isempty(states) && + !strv_contains(states, unit_load_state_to_string(u->load_state)) && + !strv_contains(states, unit_active_state_to_string(unit_active_state(u))) && + !strv_contains(states, unit_sub_state_to_string(u))) + continue; + + if (!strv_isempty(patterns) && + !strv_fnmatch_or_empty(patterns, u->id, FNM_NOESCAPE)) + continue; + + r = reply_unit_info(reply, u); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_list_units(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return list_units_filtered(message, userdata, error, NULL, NULL); +} + +static int method_list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **states = NULL; + int r; + + r = sd_bus_message_read_strv(message, &states); + if (r < 0) + return r; + + return list_units_filtered(message, userdata, error, states, NULL); +} + +static int method_list_units_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **states = NULL; + _cleanup_strv_free_ char **patterns = NULL; + int r; + + r = sd_bus_message_read_strv(message, &states); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &patterns); + if (r < 0) + return r; + + return list_units_filtered(message, userdata, error, states, patterns); +} + +static int method_list_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + Iterator i; + Job *j; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(usssoo)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(j, m->jobs, i) { + _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + + job_path = job_dbus_path(j); + if (!job_path) + return -ENOMEM; + + unit_path = unit_dbus_path(j->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append( + reply, "(usssoo)", + j->id, + j->unit->id, + job_type_to_string(j->type), + job_state_to_string(j->state), + job_path, + unit_path); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_subscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + if (sd_bus_message_get_bus(message) == m->api_bus) { + + /* Note that direct bus connection subscribe by + * default, we only track peers on the API bus here */ + + if (!m->subscribed) { + r = sd_bus_track_new(sd_bus_message_get_bus(message), &m->subscribed, NULL, NULL); + if (r < 0) + return r; + } + + r = sd_bus_track_add_sender(m->subscribed, message); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, BUS_ERROR_ALREADY_SUBSCRIBED, "Client is already subscribed."); + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_unsubscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + if (sd_bus_message_get_bus(message) == m->api_bus) { + r = sd_bus_track_remove_sender(m->subscribed, message); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, BUS_ERROR_NOT_SUBSCRIBED, "Client is not subscribed."); + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int dump_impl(sd_bus_message *message, void *userdata, sd_bus_error *error, int (*reply)(sd_bus_message *, char *)) { + _cleanup_free_ char *dump = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = manager_get_dump_string(m, &dump); + if (r < 0) + return r; + + return reply(message, dump); +} + +static int reply_dump(sd_bus_message *message, char *dump) { + return sd_bus_reply_method_return(message, "s", dump); +} + +static int method_dump(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return dump_impl(message, userdata, error, reply_dump); +} + +static int reply_dump_by_fd(sd_bus_message *message, char *dump) { + _cleanup_close_ int fd = -1; + + fd = acquire_data_fd(dump, strlen(dump), 0); + if (fd < 0) + return fd; + + return sd_bus_reply_method_return(message, "h", fd); +} + +static int method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return dump_impl(message, userdata, error, reply_dump_by_fd); +} + +static int method_refuse_snapshot(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Support for snapshots has been removed."); +} + +static int verify_run_space(const char *message, sd_bus_error *error) { + struct statvfs svfs; + uint64_t available; + + if (statvfs("/run/systemd", &svfs) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m"); + + available = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize; + + if (available < RELOAD_DISK_SPACE_MIN) { + char fb_available[FORMAT_BYTES_MAX], fb_need[FORMAT_BYTES_MAX]; + return sd_bus_error_setf(error, + BUS_ERROR_DISK_FULL, + "%s, not enough space available on /run/systemd. " + "Currently, %s are free, but a safety buffer of %s is enforced.", + message, + format_bytes(fb_available, sizeof(fb_available), available), + format_bytes(fb_need, sizeof(fb_need), RELOAD_DISK_SPACE_MIN)); + } + + return 0; +} + +int verify_run_space_and_log(const char *message) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + r = verify_run_space(message, &error); + if (r < 0) + return log_error_errno(r, "%s", bus_error_message(&error, r)); + + return 0; +} + +static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = verify_run_space("Refusing to reload", error); + if (r < 0) + return r; + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_reload_daemon_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + /* Instead of sending the reply back right away, we just + * remember that we need to and then send it after the reload + * is finished. That way the caller knows when the reload + * finished. */ + + assert(!m->pending_reload_message); + r = sd_bus_message_new_method_return(message, &m->pending_reload_message); + if (r < 0) + return r; + + m->objective = MANAGER_RELOAD; + + return 1; +} + +static int method_reexecute(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = verify_run_space("Refusing to reexecute", error); + if (r < 0) + return r; + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_reload_daemon_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + /* We don't send a reply back here, the client should + * just wait for us disconnecting. */ + + m->objective = MANAGER_REEXECUTE; + return 1; +} + +static int method_exit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "halt", error); + if (r < 0) + return r; + + /* Exit() (in contrast to SetExitCode()) is actually allowed even if + * we are running on the host. It will fall back on reboot() in + * systemd-shutdown if it cannot do the exit() because it isn't a + * container. */ + + m->objective = MANAGER_EXIT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Reboot is only supported for system managers."); + + m->objective = MANAGER_REBOOT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_poweroff(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "halt", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Powering off is only supported for system managers."); + + m->objective = MANAGER_POWEROFF; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_halt(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "halt", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Halt is only supported for system managers."); + + m->objective = MANAGER_HALT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_kexec(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "KExec is only supported for system managers."); + + m->objective = MANAGER_KEXEC; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_switch_root(sd_bus_message *message, void *userdata, sd_bus_error *error) { + char *ri = NULL, *rt = NULL; + const char *root, *init; + Manager *m = userdata; + struct statvfs svfs; + uint64_t available; + int r; + + assert(message); + assert(m); + + if (statvfs("/run/systemd", &svfs) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m"); + + available = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize; + + if (available < RELOAD_DISK_SPACE_MIN) { + char fb_available[FORMAT_BYTES_MAX], fb_need[FORMAT_BYTES_MAX]; + log_warning("Dangerously low amount of free space on /run/systemd, root switching operation might not complete successfully. " + "Currently, %s are free, but %s are suggested. Proceeding anyway.", + format_bytes(fb_available, sizeof(fb_available), available), + format_bytes(fb_need, sizeof(fb_need), RELOAD_DISK_SPACE_MIN)); + } + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Root switching is only supported by system manager."); + + r = sd_bus_message_read(message, "ss", &root, &init); + if (r < 0) + return r; + + if (isempty(root)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "New root directory may not be the empty string."); + if (!path_is_absolute(root)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "New root path '%s' is not absolute.", root); + if (path_equal(root, "/")) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "New root directory cannot be the old root directory."); + + /* Safety check */ + if (isempty(init)) { + r = path_is_os_tree(root); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to determine whether root path '%s' contains an OS tree: %m", root); + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Specified switch root path '%s' does not seem to be an OS tree. os-release file is missing.", root); + } else { + _cleanup_free_ char *chased = NULL; + + if (!path_is_absolute(init)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path to init binary '%s' not absolute.", init); + + r = chase_symlinks(init, root, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &chased); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Could not resolve init executable %s: %m", init); + + if (laccess(chased, X_OK) < 0) { + if (errno == EACCES) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Init binary %s is not executable.", init); + + return sd_bus_error_set_errnof(error, r, "Could not check whether init binary %s is executable: %m", init); + } + } + + rt = strdup(root); + if (!rt) + return -ENOMEM; + + if (!isempty(init)) { + ri = strdup(init); + if (!ri) { + free(rt); + return -ENOMEM; + } + } + + free(m->switch_root); + m->switch_root = rt; + + free(m->switch_root_init); + m->switch_root_init = ri; + + m->objective = MANAGER_SWITCH_ROOT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **plus = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &plus); + if (r < 0) + return r; + if (!strv_env_is_valid(plus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment assignments"); + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = manager_client_environment_modify(m, NULL, plus); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_unset_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **minus = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &minus); + if (r < 0) + return r; + + if (!strv_env_name_or_assignment_is_valid(minus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment variable names or assignments"); + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = manager_client_environment_modify(m, minus, NULL); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_unset_and_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **minus = NULL, **plus = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &minus); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &plus); + if (r < 0) + return r; + + if (!strv_env_name_or_assignment_is_valid(minus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment variable names or assignments"); + if (!strv_env_is_valid(plus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment assignments"); + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = manager_client_environment_modify(m, minus, plus); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_exit_code(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + uint8_t code; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "exit", error); + if (r < 0) + return r; + + r = sd_bus_message_read_basic(message, 'y', &code); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "ExitCode can only be set for user service managers or in containers."); + + m->return_value = code; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_lookup_dynamic_user_by_name(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + uid_t uid; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read_basic(message, 's', &name); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Dynamic users are only supported in the system instance."); + if (!valid_user_group_name(name)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "User name invalid: %s", name); + + r = dynamic_user_lookup_name(m, name, &uid); + if (r == -ESRCH) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER, "Dynamic user %s does not exist.", name); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "u", (uint32_t) uid); +} + +static int method_lookup_dynamic_user_by_uid(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *name = NULL; + Manager *m = userdata; + uid_t uid; + int r; + + assert(message); + assert(m); + + assert_cc(sizeof(uid) == sizeof(uint32_t)); + r = sd_bus_message_read_basic(message, 'u', &uid); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Dynamic users are only supported in the system instance."); + if (!uid_is_valid(uid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "User ID invalid: " UID_FMT, uid); + + r = dynamic_user_lookup_uid(m, uid, &name); + if (r == -ESRCH) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER, "Dynamic user ID " UID_FMT " does not exist.", uid); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", name); +} + +static int method_get_dynamic_users(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + DynamicUser *d; + Iterator i; + int r; + + assert(message); + assert(m); + + assert_cc(sizeof(uid_t) == sizeof(uint32_t)); + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Dynamic users are only supported in the system instance."); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(us)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(d, m->dynamic_users, i) { + uid_t uid; + + r = dynamic_user_current(d, &uid); + if (r == -EAGAIN) /* not realized yet? */ + continue; + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Failed to lookup a dynamic user."); + + r = sd_bus_message_append(reply, "(us)", uid, d->name); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + UnitFileList *item; + Hashmap *h; + Iterator i; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + h = hashmap_new(&string_hash_ops); + if (!h) + return -ENOMEM; + + r = unit_file_get_list(m->unit_file_scope, NULL, h, states, patterns); + if (r < 0) + goto fail; + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + goto fail; + + HASHMAP_FOREACH(item, h, i) { + + r = sd_bus_message_append(reply, "(ss)", item->path, unit_file_state_to_string(item->state)); + if (r < 0) + goto fail; + } + + unit_file_list_free(h); + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); + +fail: + unit_file_list_free(h); + return r; +} + +static int method_list_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return list_unit_files_by_patterns(message, userdata, error, NULL, NULL); +} + +static int method_list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **states = NULL; + _cleanup_strv_free_ char **patterns = NULL; + int r; + + r = sd_bus_message_read_strv(message, &states); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &patterns); + if (r < 0) + return r; + + return list_unit_files_by_patterns(message, userdata, error, states, patterns); +} + +static int method_get_unit_file_state(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + UnitFileState state; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = unit_file_get_state(m->unit_file_scope, NULL, name, &state); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", unit_file_state_to_string(state)); +} + +static int method_get_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *default_target = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = unit_file_get_default(m->unit_file_scope, NULL, &default_target); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", default_target); +} + +static int send_unit_files_changed(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL; + int r; + + assert(bus); + + r = sd_bus_message_new_signal(bus, &message, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "UnitFilesChanged"); + if (r < 0) + return r; + + return sd_bus_send(bus, message, NULL); +} + +/* Create an error reply, using the error information from changes[] + * if possible, and fall back to generating an error from error code c. + * The error message only describes the first error. + * + * Coordinate with unit_file_dump_changes() in install.c. + */ +static int install_error( + sd_bus_error *error, + int c, + UnitFileChange *changes, + size_t n_changes) { + + size_t i; + int r; + + for (i = 0; i < n_changes; i++) + + switch(changes[i].type) { + + case 0 ... INT_MAX: + continue; + + case -EEXIST: + if (changes[i].source) + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, + "File %s already exists and is a symlink to %s.", + changes[i].path, changes[i].source); + else + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, + "File %s already exists.", + changes[i].path); + goto found; + + case -ERFKILL: + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, + "Unit file %s is masked.", changes[i].path); + goto found; + + case -EADDRNOTAVAIL: + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_GENERATED, + "Unit %s is transient or generated.", changes[i].path); + goto found; + + case -ELOOP: + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_LINKED, + "Refusing to operate on linked unit file %s", changes[i].path); + goto found; + + case -ENOENT: + r = sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit file %s does not exist.", changes[i].path); + goto found; + + default: + r = sd_bus_error_set_errnof(error, changes[i].type, "File %s: %m", changes[i].path); + goto found; + } + + r = c < 0 ? c : -EINVAL; + + found: + unit_file_changes_free(changes, n_changes); + return r; +} + +static int reply_unit_file_changes_and_free( + Manager *m, + sd_bus_message *message, + int carries_install_info, + UnitFileChange *changes, + size_t n_changes, + sd_bus_error *error) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + bool bad = false, good = false; + size_t i; + int r; + + if (unit_file_changes_have_modification(changes, n_changes)) { + r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL); + if (r < 0) + log_debug_errno(r, "Failed to send UnitFilesChanged signal: %m"); + } + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + goto fail; + + if (carries_install_info >= 0) { + r = sd_bus_message_append(reply, "b", carries_install_info); + if (r < 0) + goto fail; + } + + r = sd_bus_message_open_container(reply, 'a', "(sss)"); + if (r < 0) + goto fail; + + for (i = 0; i < n_changes; i++) { + + if (changes[i].type < 0) { + bad = true; + continue; + } + + r = sd_bus_message_append( + reply, "(sss)", + unit_file_change_type_to_string(changes[i].type), + changes[i].path, + changes[i].source); + if (r < 0) + goto fail; + + good = true; + } + + /* If there was a failed change, and no successful change, then return the first failure as proper method call + * error. */ + if (bad && !good) + return install_error(error, 0, changes, n_changes); + + r = sd_bus_message_close_container(reply); + if (r < 0) + goto fail; + + unit_file_changes_free(changes, n_changes); + return sd_bus_send(NULL, reply, NULL); + +fail: + unit_file_changes_free(changes, n_changes); + return r; +} + +static int method_enable_unit_files_generic( + sd_bus_message *message, + Manager *m, + int (*call)(UnitFileScope scope, UnitFileFlags flags, const char *root_dir, char *files[], UnitFileChange **changes, size_t *n_changes), + bool carries_install_info, + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + UnitFileChange *changes = NULL; + size_t n_changes = 0; + UnitFileFlags flags; + int runtime, force, r; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "bb", &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = call(m->unit_file_scope, flags, NULL, l, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, carries_install_info ? r : -1, changes, n_changes, error); +} + +static int method_enable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_enable, true, error); +} + +static int method_reenable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_reenable, true, error); +} + +static int method_link_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_link, false, error); +} + +static int unit_file_preset_without_mode(UnitFileScope scope, UnitFileFlags flags, const char *root_dir, char **files, UnitFileChange **changes, size_t *n_changes) { + return unit_file_preset(scope, flags, root_dir, files, UNIT_FILE_PRESET_FULL, changes, n_changes); +} + +static int method_preset_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_preset_without_mode, true, error); +} + +static int method_mask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_mask, false, error); +} + +static int method_preset_unit_files_with_mode(sd_bus_message *message, void *userdata, sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + UnitFileChange *changes = NULL; + size_t n_changes = 0; + Manager *m = userdata; + UnitFilePresetMode mm; + int runtime, force, r; + UnitFileFlags flags; + const char *mode; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + if (isempty(mode)) + mm = UNIT_FILE_PRESET_FULL; + else { + mm = unit_file_preset_mode_from_string(mode); + if (mm < 0) + return -EINVAL; + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_preset(m->unit_file_scope, flags, NULL, l, mm, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, r, changes, n_changes, error); +} + +static int method_disable_unit_files_generic( + sd_bus_message *message, + Manager *m, + int (*call)(UnitFileScope scope, UnitFileFlags flags, const char *root_dir, char *files[], UnitFileChange **changes, size_t *n_changes), + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + UnitFileChange *changes = NULL; + size_t n_changes = 0; + int r, runtime; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &runtime); + if (r < 0) + return r; + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = call(m->unit_file_scope, runtime ? UNIT_FILE_RUNTIME : 0, NULL, l, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_disable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_disable_unit_files_generic(message, userdata, unit_file_disable, error); +} + +static int method_unmask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_disable_unit_files_generic(message, userdata, unit_file_unmask, error); +} + +static int method_revert_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + UnitFileChange *changes = NULL; + size_t n_changes = 0; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_revert(m->unit_file_scope, NULL, l, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_set_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) { + UnitFileChange *changes = NULL; + size_t n_changes = 0; + Manager *m = userdata; + const char *name; + int force, r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "enable", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "sb", &name, &force); + if (r < 0) + return r; + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_set_default(m->unit_file_scope, force ? UNIT_FILE_FORCE : 0, NULL, name, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_preset_all_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + UnitFileChange *changes = NULL; + size_t n_changes = 0; + Manager *m = userdata; + UnitFilePresetMode mm; + const char *mode; + UnitFileFlags flags; + int force, runtime, r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "enable", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + if (isempty(mode)) + mm = UNIT_FILE_PRESET_FULL; + else { + mm = unit_file_preset_mode_from_string(mode); + if (mm < 0) + return -EINVAL; + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_preset_all(m->unit_file_scope, flags, NULL, mm, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_add_dependency_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + UnitFileChange *changes = NULL; + size_t n_changes = 0; + int runtime, force, r; + char *target, *type; + UnitDependency dep; + UnitFileFlags flags; + + assert(message); + assert(m); + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ssbb", &target, &type, &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + dep = unit_dependency_from_string(type); + if (dep < 0) + return -EINVAL; + + r = unit_file_add_dependency(m->unit_file_scope, flags, NULL, l, target, dep, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_get_unit_file_links(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + UnitFileChange *changes = NULL; + size_t n_changes = 0, i; + UnitFileFlags flags; + const char *name; + char **p; + int runtime, r; + + r = sd_bus_message_read(message, "sb", &name, &runtime); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, SD_BUS_TYPE_ARRAY, "s"); + if (r < 0) + return r; + + p = STRV_MAKE(name); + flags = UNIT_FILE_DRY_RUN | + (runtime ? UNIT_FILE_RUNTIME : 0); + + r = unit_file_disable(UNIT_FILE_SYSTEM, flags, NULL, p, &changes, &n_changes); + if (r < 0) + return log_error_errno(r, "Failed to get file links for %s: %m", name); + + for (i = 0; i < n_changes; i++) + if (changes[i].type == UNIT_FILE_UNLINK) { + r = sd_bus_message_append(reply, "s", changes[i].path); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_get_job_waiting(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + uint32_t id; + Job *j; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "u", &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id); + + return bus_job_method_get_waiting_jobs(message, j, error); +} + +static int method_abandon_scope(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + if (u->type != UNIT_SCOPE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit '%s' is not a scope unit, refusing.", name); + + return bus_scope_method_abandon(message, u, error); +} + +const sd_bus_vtable bus_manager_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Version", "s", property_get_version, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Features", "s", property_get_features, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Virtualization", "s", property_get_virtualization, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Architecture", "s", property_get_architecture, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Tainted", "s", property_get_tainted, 0, SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("FirmwareTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FIRMWARE]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("LoaderTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_LOADER]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("KernelTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_KERNEL]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UserspaceTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_USERSPACE]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("FinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("SecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("SecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", property_get_log_level, property_set_log_level, 0, 0), + SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", property_get_log_target, property_set_log_target, 0, 0), + SD_BUS_PROPERTY("NNames", "u", property_get_hashmap_size, offsetof(Manager, units), 0), + SD_BUS_PROPERTY("NFailedUnits", "u", property_get_set_size, offsetof(Manager, failed_units), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NJobs", "u", property_get_hashmap_size, offsetof(Manager, jobs), 0), + SD_BUS_PROPERTY("NInstalledJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_installed_jobs), 0), + SD_BUS_PROPERTY("NFailedJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_failed_jobs), 0), + SD_BUS_PROPERTY("Progress", "d", property_get_progress, 0, 0), + SD_BUS_PROPERTY("Environment", "as", property_get_environment, 0, 0), + SD_BUS_PROPERTY("ConfirmSpawn", "b", bus_property_get_bool, offsetof(Manager, confirm_spawn), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ShowStatus", "b", property_get_show_status, 0, 0), + SD_BUS_PROPERTY("UnitPath", "as", NULL, offsetof(Manager, lookup_paths.search_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultStandardOutput", "s", bus_property_get_exec_output, offsetof(Manager, default_std_output), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultStandardError", "s", bus_property_get_exec_output, offsetof(Manager, default_std_output), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogUSec", "t", bus_property_get_usec, property_set_runtime_watchdog, offsetof(Manager, runtime_watchdog), 0), + SD_BUS_WRITABLE_PROPERTY("ShutdownWatchdogUSec", "t", bus_property_get_usec, bus_property_set_usec, offsetof(Manager, shutdown_watchdog), 0), + SD_BUS_WRITABLE_PROPERTY("ServiceWatchdogs", "b", bus_property_get_bool, bus_property_set_bool, offsetof(Manager, service_watchdogs), 0), + SD_BUS_PROPERTY("ControlGroup", "s", NULL, offsetof(Manager, cgroup_root), 0), + SD_BUS_PROPERTY("SystemState", "s", property_get_system_state, 0, 0), + SD_BUS_PROPERTY("ExitCode", "y", bus_property_get_unsigned, offsetof(Manager, return_value), 0), + SD_BUS_PROPERTY("DefaultTimerAccuracyUSec", "t", bus_property_get_usec, offsetof(Manager, default_timer_accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTimeoutStartUSec", "t", bus_property_get_usec, offsetof(Manager, default_timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTimeoutStopUSec", "t", bus_property_get_usec, offsetof(Manager, default_timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultRestartUSec", "t", bus_property_get_usec, offsetof(Manager, default_restart_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultStartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST), + /* The following two items are obsolete alias */ + SD_BUS_PROPERTY("DefaultStartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("DefaultStartLimitInterval", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("DefaultStartLimitBurst", "u", bus_property_get_unsigned, offsetof(Manager, default_start_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultCPUAccounting", "b", bus_property_get_bool, offsetof(Manager, default_cpu_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultBlockIOAccounting", "b", bus_property_get_bool, offsetof(Manager, default_blockio_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultMemoryAccounting", "b", bus_property_get_bool, offsetof(Manager, default_memory_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTasksAccounting", "b", bus_property_get_bool, offsetof(Manager, default_tasks_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCPU", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCPUSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitFSIZE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitDATA", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitDATASoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSTACK", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCORE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCORESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRSS", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRSSSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNOFILE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitAS", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitASSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNPROC", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitLOCKS", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNICE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNICESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTPRIO", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTasksMax", "t", NULL, offsetof(Manager, default_tasks_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), + + SD_BUS_METHOD("GetUnit", "s", "o", method_get_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetUnitByPID", "u", "o", method_get_unit_by_pid, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetUnitByInvocationID", "ay", "o", method_get_unit_by_invocation_id, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetUnitByControlGroup", "s", "o", method_get_unit_by_control_group, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("LoadUnit", "s", "o", method_load_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("StartUnit", "ss", "o", method_start_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("StartUnitReplace", "sss", "o", method_start_unit_replace, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("StopUnit", "ss", "o", method_stop_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ReloadUnit", "ss", "o", method_reload_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("RestartUnit", "ss", "o", method_restart_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("TryRestartUnit", "ss", "o", method_try_restart_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ReloadOrRestartUnit", "ss", "o", method_reload_or_restart_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ReloadOrTryRestartUnit", "ss", "o", method_reload_or_try_restart_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("KillUnit", "ssi", NULL, method_kill_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ResetFailedUnit", "s", NULL, method_reset_failed_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("SetUnitProperties", "sba(sv)", NULL, method_set_unit_properties, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("RefUnit", "s", NULL, method_ref_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("UnrefUnit", "s", NULL, method_unref_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("StartTransientUnit", "ssa(sv)a(sa(sv))", "o", method_start_transient_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetUnitProcesses", "s", "a(sus)", method_get_unit_processes, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("AttachProcessesToUnit", "ssau", NULL, method_attach_processes_to_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("AbandonScope", "s", NULL, method_abandon_scope, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetJob", "u", "o", method_get_job, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetJobAfter", "u", "a(usssoo)", method_get_job_waiting, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetJobBefore", "u", "a(usssoo)", method_get_job_waiting, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("CancelJob", "u", NULL, method_cancel_job, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ClearJobs", NULL, NULL, method_clear_jobs, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ResetFailed", NULL, NULL, method_reset_failed, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ListUnits", NULL, "a(ssssssouso)", method_list_units, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ListUnitsFiltered", "as", "a(ssssssouso)", method_list_units_filtered, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ListUnitsByPatterns", "asas", "a(ssssssouso)", method_list_units_by_patterns, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ListUnitsByNames", "as", "a(ssssssouso)", method_list_units_by_names, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ListJobs", NULL, "a(usssoo)", method_list_jobs, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Subscribe", NULL, NULL, method_subscribe, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Unsubscribe", NULL, NULL, method_unsubscribe, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Dump", NULL, "s", method_dump, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("DumpByFileDescriptor", NULL, "h", method_dump_by_fd, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("CreateSnapshot", "sb", "o", method_refuse_snapshot, SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN), + SD_BUS_METHOD("RemoveSnapshot", "s", NULL, method_refuse_snapshot, SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN), + SD_BUS_METHOD("Reload", NULL, NULL, method_reload, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Reexecute", NULL, NULL, method_reexecute, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Exit", NULL, NULL, method_exit, 0), + SD_BUS_METHOD("Reboot", NULL, NULL, method_reboot, SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("PowerOff", NULL, NULL, method_poweroff, SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("Halt", NULL, NULL, method_halt, SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("KExec", NULL, NULL, method_kexec, SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("SwitchRoot", "ss", NULL, method_switch_root, SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("SetEnvironment", "as", NULL, method_set_environment, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("UnsetEnvironment", "as", NULL, method_unset_environment, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("UnsetAndSetEnvironment", "asas", NULL, method_unset_and_set_environment, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ListUnitFiles", NULL, "a(ss)", method_list_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ListUnitFilesByPatterns", "asas", "a(ss)", method_list_unit_files_by_patterns, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetUnitFileState", "s", "s", method_get_unit_file_state, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("EnableUnitFiles", "asbb", "ba(sss)", method_enable_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("DisableUnitFiles", "asb", "a(sss)", method_disable_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ReenableUnitFiles", "asbb", "ba(sss)", method_reenable_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("LinkUnitFiles", "asbb", "a(sss)", method_link_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("PresetUnitFiles", "asbb", "ba(sss)", method_preset_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("PresetUnitFilesWithMode", "assbb", "ba(sss)", method_preset_unit_files_with_mode, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("MaskUnitFiles", "asbb", "a(sss)", method_mask_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("UnmaskUnitFiles", "asb", "a(sss)", method_unmask_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("RevertUnitFiles", "as", "a(sss)", method_revert_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("SetDefaultTarget", "sb", "a(sss)", method_set_default_target, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetDefaultTarget", NULL, "s", method_get_default_target, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("PresetAllUnitFiles", "sbb", "a(sss)", method_preset_all_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("AddDependencyUnitFiles", "asssbb", "a(sss)", method_add_dependency_unit_files, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetUnitFileLinks", "sb", "as", method_get_unit_file_links, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("SetExitCode", "y", NULL, method_set_exit_code, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("LookupDynamicUserByName", "s", "u", method_lookup_dynamic_user_by_name, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("LookupDynamicUserByUID", "u", "s", method_lookup_dynamic_user_by_uid, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetDynamicUsers", NULL, "a(us)", method_get_dynamic_users, SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_SIGNAL("UnitNew", "so", 0), + SD_BUS_SIGNAL("UnitRemoved", "so", 0), + SD_BUS_SIGNAL("JobNew", "uos", 0), + SD_BUS_SIGNAL("JobRemoved", "uoss", 0), + SD_BUS_SIGNAL("StartupFinished", "tttttt", 0), + SD_BUS_SIGNAL("UnitFilesChanged", NULL, 0), + SD_BUS_SIGNAL("Reloading", "b", 0), + + SD_BUS_VTABLE_END +}; + +static int send_finished(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL; + usec_t *times = userdata; + int r; + + assert(bus); + assert(times); + + r = sd_bus_message_new_signal(bus, &message, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "StartupFinished"); + if (r < 0) + return r; + + r = sd_bus_message_append(message, "tttttt", times[0], times[1], times[2], times[3], times[4], times[5]); + if (r < 0) + return r; + + return sd_bus_send(bus, message, NULL); +} + +void bus_manager_send_finished( + Manager *m, + usec_t firmware_usec, + usec_t loader_usec, + usec_t kernel_usec, + usec_t initrd_usec, + usec_t userspace_usec, + usec_t total_usec) { + + int r; + + assert(m); + + r = bus_foreach_bus( + m, + NULL, + send_finished, + (usec_t[6]) { + firmware_usec, + loader_usec, + kernel_usec, + initrd_usec, + userspace_usec, + total_usec + }); + if (r < 0) + log_debug_errno(r, "Failed to send finished signal: %m"); +} + +static int send_reloading(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL; + int r; + + assert(bus); + + r = sd_bus_message_new_signal(bus, &message, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "Reloading"); + if (r < 0) + return r; + + r = sd_bus_message_append(message, "b", PTR_TO_INT(userdata)); + if (r < 0) + return r; + + return sd_bus_send(bus, message, NULL); +} + +void bus_manager_send_reloading(Manager *m, bool active) { + int r; + + assert(m); + + r = bus_foreach_bus(m, NULL, send_reloading, INT_TO_PTR(active)); + if (r < 0) + log_debug_errno(r, "Failed to send reloading signal: %m"); +} + +static int send_changed_signal(sd_bus *bus, void *userdata) { + assert(bus); + + return sd_bus_emit_properties_changed_strv(bus, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + NULL); +} + +void bus_manager_send_change_signal(Manager *m) { + int r; + + assert(m); + + r = bus_foreach_bus(m, NULL, send_changed_signal, NULL); + if (r < 0) + log_debug_errno(r, "Failed to send manager change signal: %m"); +} diff --git a/src/core/dbus-manager.h b/src/core/dbus-manager.h new file mode 100644 index 0000000..d0306ad --- /dev/null +++ b/src/core/dbus-manager.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus-vtable.h" + +#include "manager.h" + +extern const sd_bus_vtable bus_manager_vtable[]; + +void bus_manager_send_finished(Manager *m, usec_t firmware_usec, usec_t loader_usec, usec_t kernel_usec, usec_t initrd_usec, usec_t userspace_usec, usec_t total_usec); +void bus_manager_send_reloading(Manager *m, bool active); +void bus_manager_send_change_signal(Manager *m); + +int verify_run_space_and_log(const char *message); diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c new file mode 100644 index 0000000..b6d6162 --- /dev/null +++ b/src/core/dbus-mount.c @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "bus-util.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-kill.h" +#include "dbus-mount.h" +#include "dbus-util.h" +#include "mount.h" +#include "string-util.h" +#include "unit.h" + +static const char *mount_get_what(const Mount *m) { + if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.what) + return m->parameters_proc_self_mountinfo.what; + if (m->from_fragment && m->parameters_fragment.what) + return m->parameters_fragment.what; + return NULL; +} + +static const char *mount_get_options(const Mount *m) { + if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.options) + return m->parameters_proc_self_mountinfo.options; + if (m->from_fragment && m->parameters_fragment.options) + return m->parameters_fragment.options; + return NULL; +} + +static const char *mount_get_fstype(const Mount *m) { + if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.fstype) + return m->parameters_proc_self_mountinfo.fstype; + else if (m->from_fragment && m->parameters_fragment.fstype) + return m->parameters_fragment.fstype; + return NULL; +} + +static BUS_DEFINE_PROPERTY_GET(property_get_what, "s", Mount, mount_get_what); +static BUS_DEFINE_PROPERTY_GET(property_get_options, "s", Mount, mount_get_options); +static BUS_DEFINE_PROPERTY_GET(property_get_type, "s", Mount, mount_get_fstype); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, mount_result, MountResult); + +const sd_bus_vtable bus_mount_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Mount, where), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("What", "s", property_get_what, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Options","s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Type", "s", property_get_type, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Mount, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Mount, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Mount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SloppyOptions", "b", bus_property_get_bool, offsetof(Mount, sloppy_options), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LazyUnmount", "b", bus_property_get_bool, offsetof(Mount, lazy_unmount), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ForceUnmount", "b", bus_property_get_bool, offsetof(Mount, force_unmount), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Mount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_VTABLE("ExecMount", offsetof(Mount, exec_command[MOUNT_EXEC_MOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_VTABLE("ExecUnmount", offsetof(Mount, exec_command[MOUNT_EXEC_UNMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_VTABLE("ExecRemount", offsetof(Mount, exec_command[MOUNT_EXEC_REMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_VTABLE_END +}; + +static int bus_mount_set_transient_property( + Mount *m, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(m); + + assert(m); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Where")) + return bus_set_transient_path(u, name, &m->where, message, flags, error); + + if (streq(name, "What")) + return bus_set_transient_string(u, name, &m->parameters_fragment.what, message, flags, error); + + if (streq(name, "Options")) + return bus_set_transient_string(u, name, &m->parameters_fragment.options, message, flags, error); + + if (streq(name, "Type")) + return bus_set_transient_string(u, name, &m->parameters_fragment.fstype, message, flags, error); + + if (streq(name, "TimeoutUSec")) + return bus_set_transient_usec_fix_0(u, name, &m->timeout_usec, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &m->directory_mode, message, flags, error); + + if (streq(name, "SloppyOptions")) + return bus_set_transient_bool(u, name, &m->sloppy_options, message, flags, error); + + if (streq(name, "LazyUnmount")) + return bus_set_transient_bool(u, name, &m->lazy_unmount, message, flags, error); + + if (streq(name, "ForceUnmount")) + return bus_set_transient_bool(u, name, &m->force_unmount, message, flags, error); + + return 0; +} + +int bus_mount_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Mount *m = MOUNT(u); + int r; + + assert(m); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &m->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->transient && u->load_state == UNIT_STUB) { + /* This is a transient unit, let's load a little more */ + + r = bus_mount_set_transient_property(m, name, message, flags, error); + if (r != 0) + return r; + + r = bus_exec_context_set_transient_property(u, &m->exec_context, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &m->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_mount_commit_properties(Unit *u) { + assert(u); + + unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-mount.h b/src/core/dbus-mount.h new file mode 100644 index 0000000..f7112a9 --- /dev/null +++ b/src/core/dbus-mount.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_mount_vtable[]; + +int bus_mount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_mount_commit_properties(Unit *u); diff --git a/src/core/dbus-path.c b/src/core/dbus-path.c new file mode 100644 index 0000000..1a97d62 --- /dev/null +++ b/src/core/dbus-path.c @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "alloc-util.h" +#include "bus-util.h" +#include "dbus-path.h" +#include "dbus-util.h" +#include "list.h" +#include "path.h" +#include "path-util.h" +#include "string-util.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, path_result, PathResult); + +static int property_get_paths( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Path *p = userdata; + PathSpec *k; + int r; + + assert(bus); + assert(reply); + assert(p); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(spec, k, p->specs) { + r = sd_bus_message_append(reply, "(ss)", path_type_to_string(k->type), k->path); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_path_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Paths", "a(ss)", property_get_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MakeDirectory", "b", bus_property_get_bool, offsetof(Path, make_directory), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Path, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Path, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_VTABLE_END +}; + +static int bus_path_set_transient_property( + Path *p, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(p); + int r; + + assert(p); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "MakeDirectory")) + return bus_set_transient_bool(u, name, &p->make_directory, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &p->directory_mode, message, flags, error); + + if (streq(name, "Paths")) { + const char *type_name, *path; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &type_name, &path)) > 0) { + PathType t; + + t = path_type_from_string(type_name); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown path type: %s", type_name); + + if (isempty(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is empty", type_name); + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is not absolute: %s", type_name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *k; + PathSpec *s; + + k = strdup(path); + if (!k) + return -ENOMEM; + + path_simplify(k, false); + + s = new0(PathSpec, 1); + if (!s) + return -ENOMEM; + + s->unit = u; + s->path = TAKE_PTR(k); + s->type = t; + s->inotify_fd = -1; + + LIST_PREPEND(spec, p->specs, s); + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", type_name, path); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + path_free_specs(p); + unit_write_settingf(u, flags, name, "PathExists="); + } + + return 1; + } + + return 0; +} + +int bus_path_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags mode, + sd_bus_error *error) { + + Path *p = PATH(u); + + assert(p); + assert(name); + assert(message); + + if (u->transient && u->load_state == UNIT_STUB) + return bus_path_set_transient_property(p, name, message, mode, error); + + return 0; +} diff --git a/src/core/dbus-path.h b/src/core/dbus-path.h new file mode 100644 index 0000000..ad42b23 --- /dev/null +++ b/src/core/dbus-path.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_path_vtable[]; + +int bus_path_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c new file mode 100644 index 0000000..bb807df --- /dev/null +++ b/src/core/dbus-scope.c @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-internal.h" +#include "bus-util.h" +#include "dbus-cgroup.h" +#include "dbus-kill.h" +#include "dbus-scope.h" +#include "dbus-unit.h" +#include "dbus-util.h" +#include "dbus.h" +#include "scope.h" +#include "selinux-access.h" +#include "unit.h" + +int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Scope *s = userdata; + int r; + + assert(message); + assert(s); + + r = mac_selinux_unit_access_check(UNIT(s), message, "stop", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(UNIT(s)->manager, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = scope_abandon(s); + if (r == -ESTALE) + return sd_bus_error_setf(error, BUS_ERROR_SCOPE_NOT_RUNNING, "Scope %s is not running, cannot abandon.", UNIT(s)->id); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, scope_result, ScopeResult); + +const sd_bus_vtable bus_scope_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Controller", "s", NULL, offsetof(Scope, controller), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Scope, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Scope, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_SIGNAL("RequestStop", NULL, 0), + SD_BUS_METHOD("Abandon", NULL, NULL, bus_scope_method_abandon, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_VTABLE_END +}; + +static int bus_scope_set_transient_property( + Scope *s, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int r; + + assert(s); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "TimeoutStopUSec")) + return bus_set_transient_usec(UNIT(s), name, &s->timeout_stop_usec, message, flags, error); + + if (streq(name, "PIDs")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "u"); + if (r < 0) + return r; + + for (;;) { + uint32_t upid; + pid_t pid; + + r = sd_bus_message_read(message, "u", &upid); + if (r < 0) + return r; + if (r == 0) + break; + + if (upid == 0) { + if (!creds) { + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + } + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } else + pid = (uid_t) upid; + + r = unit_pid_attachable(UNIT(s), pid, error); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_watch_pid(UNIT(s), pid); + if (r < 0 && r != -EEXIST) + return r; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (n <= 0) + return -EINVAL; + + return 1; + + } else if (streq(name, "Controller")) { + const char *controller; + + /* We can't support direct connections with this, as direct connections know no service or unique name + * concept, but the Controller field stores exactly that. */ + if (sd_bus_message_get_bus(message) != UNIT(s)->manager->api_bus) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Sorry, Controller= logic only supported via the bus."); + + r = sd_bus_message_read(message, "s", &controller); + if (r < 0) + return r; + + if (!isempty(controller) && !service_name_is_valid(controller)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Controller '%s' is not a valid bus name.", controller); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = free_and_strdup(&s->controller, empty_to_null(controller)); + if (r < 0) + return r; + } + + return 1; + } + + return 0; +} + +int bus_scope_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Scope *s = SCOPE(u); + int r; + + assert(s); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->load_state == UNIT_STUB) { + /* While we are created we still accept PIDs */ + + r = bus_scope_set_transient_property(s, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_scope_commit_properties(Unit *u) { + assert(u); + + unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +} + +int bus_scope_send_request_stop(Scope *s) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(s); + + if (!s->controller) + return 0; + + p = unit_dbus_path(UNIT(s)); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + UNIT(s)->manager->api_bus, + &m, + p, + "org.freedesktop.systemd1.Scope", + "RequestStop"); + if (r < 0) + return r; + + return sd_bus_send_to(UNIT(s)->manager->api_bus, m, s->controller, NULL); +} + +static int on_controller_gone(sd_bus_track *track, void *userdata) { + Scope *s = userdata; + + assert(track); + + if (s->controller) { + log_unit_debug(UNIT(s), "Controller %s disappeared from bus.", s->controller); + unit_add_to_dbus_queue(UNIT(s)); + s->controller = mfree(s->controller); + } + + s->controller_track = sd_bus_track_unref(s->controller_track); + + return 0; +} + +int bus_scope_track_controller(Scope *s) { + int r; + + assert(s); + + if (!s->controller || s->controller_track) + return 0; + + r = sd_bus_track_new(UNIT(s)->manager->api_bus, &s->controller_track, on_controller_gone, s); + if (r < 0) + return r; + + r = sd_bus_track_add_name(s->controller_track, s->controller); + if (r < 0) { + s->controller_track = sd_bus_track_unref(s->controller_track); + return r; + } + + return 0; +} diff --git a/src/core/dbus-scope.h b/src/core/dbus-scope.h new file mode 100644 index 0000000..702f558 --- /dev/null +++ b/src/core/dbus-scope.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "scope.h" +#include "unit.h" + +extern const sd_bus_vtable bus_scope_vtable[]; + +int bus_scope_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); +int bus_scope_commit_properties(Unit *u); + +int bus_scope_send_request_stop(Scope *s); + +int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int bus_scope_track_controller(Scope *s); diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c new file mode 100644 index 0000000..0904cc0 --- /dev/null +++ b/src/core/dbus-service.c @@ -0,0 +1,437 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <stdio_ext.h> + +#include "alloc-util.h" +#include "async.h" +#include "bus-internal.h" +#include "bus-util.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-kill.h" +#include "dbus-service.h" +#include "dbus-util.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "parse-util.h" +#include "path-util.h" +#include "service.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, service_type, ServiceType); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, service_result, ServiceResult); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart, service_restart, ServiceRestart); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_notify_access, notify_access, NotifyAccess); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_emergency_action, emergency_action, EmergencyAction); + +static int property_get_exit_status_set( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExitStatusSet *status_set = userdata; + Iterator i; + void *id; + int r; + + assert(bus); + assert(reply); + assert(status_set); + + r = sd_bus_message_open_container(reply, 'r', "aiai"); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "i"); + if (r < 0) + return r; + + SET_FOREACH(id, status_set->status, i) { + int val = PTR_TO_INT(id); + + if (val < 0 || val > 255) + continue; + + r = sd_bus_message_append_basic(reply, 'i', &val); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "i"); + if (r < 0) + return r; + + SET_FOREACH(id, status_set->signal, i) { + int val = PTR_TO_INT(id); + const char *str; + + str = signal_to_string(val); + if (!str) + continue; + + r = sd_bus_message_append_basic(reply, 'i', &val); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_service_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Service, type), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Restart", "s", property_get_restart, offsetof(Service, restart), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PIDFile", "s", NULL, offsetof(Service, pid_file), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NotifyAccess", "s", property_get_notify_access, offsetof(Service, notify_access), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartUSec", "t", bus_property_get_usec, offsetof(Service, restart_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutStartUSec", "t", bus_property_get_usec, offsetof(Service, timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Service, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Service, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WatchdogUSec", "t", bus_property_get_usec, offsetof(Service, watchdog_usec), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("WatchdogTimestamp", offsetof(Service, watchdog_timestamp), 0), + SD_BUS_PROPERTY("PermissionsStartOnly", "b", bus_property_get_bool, offsetof(Service, permissions_start_only), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* 😷 deprecated */ + SD_BUS_PROPERTY("RootDirectoryStartOnly", "b", bus_property_get_bool, offsetof(Service, root_directory_start_only), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemainAfterExit", "b", bus_property_get_bool, offsetof(Service, remain_after_exit), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("GuessMainPID", "b", bus_property_get_bool, offsetof(Service, guess_main_pid), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartPreventExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_prevent_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartForceExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_force_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SuccessExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, success_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MainPID", "u", bus_property_get_pid, offsetof(Service, main_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Service, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("BusName", "s", NULL, offsetof(Service, bus_name), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FileDescriptorStoreMax", "u", bus_property_get_unsigned, offsetof(Service, n_fd_store_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NFileDescriptorStore", "u", bus_property_get_unsigned, offsetof(Service, n_fd_store), 0), + SD_BUS_PROPERTY("StatusText", "s", NULL, offsetof(Service, status_text), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("StatusErrno", "i", bus_property_get_int, offsetof(Service, status_errno), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Service, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("USBFunctionDescriptors", "s", NULL, offsetof(Service, usb_function_descriptors), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("USBFunctionStrings", "s", NULL, offsetof(Service, usb_function_strings), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NRestarts", "u", bus_property_get_unsigned, offsetof(Service, n_restarts), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + + BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStart", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecReload", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStop", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + + /* The following four are obsolete, and thus marked hidden here. They moved into the Unit interface */ + SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StartLimitAction", "s", property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("FailureAction", "s", property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_VTABLE_END +}; + +static int bus_set_transient_exit_status( + Unit *u, + const char *name, + ExitStatusSet *status_set, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const int *status, *signal; + size_t sz_status, sz_signal, i; + int r; + + r = sd_bus_message_enter_container(message, 'r', "aiai"); + if (r < 0) + return r; + + r = sd_bus_message_read_array(message, 'i', (const void **) &status, &sz_status); + if (r < 0) + return r; + + r = sd_bus_message_read_array(message, 'i', (const void **) &signal, &sz_signal); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (sz_status == 0 && sz_signal == 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) { + exit_status_set_free(status_set); + unit_write_settingf(u, flags, name, "%s=", name); + return 1; + } + + for (i = 0; i < sz_status; i++) { + if (status[i] < 0 || status[i] > 255) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid status code in %s: %i", name, status[i]); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = set_ensure_allocated(&status_set->status, NULL); + if (r < 0) + return r; + + r = set_put(status_set->status, INT_TO_PTR(status[i])); + if (r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%i", name, status[i]); + } + } + + for (i = 0; i < sz_signal; i++) { + const char *str; + + str = signal_to_string(signal[i]); + if (!str) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid signal in %s: %i", name, signal[i]); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = set_ensure_allocated(&status_set->signal, NULL); + if (r < 0) + return r; + + r = set_put(status_set->signal, INT_TO_PTR(signal[i])); + if (r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%s", name, str); + } + } + + return 1; +} + +static int bus_set_transient_std_fd( + Unit *u, + const char *name, + int *p, + bool *b, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int fd, r; + + assert(p); + assert(b); + + r = sd_bus_message_read(message, "h", &fd); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + int copy; + + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) + return -errno; + + asynchronous_close(*p); + *p = copy; + *b = true; + } + + return 1; +} +static BUS_DEFINE_SET_TRANSIENT_PARSE(notify_access, NotifyAccess, notify_access_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(service_type, ServiceType, service_type_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart, ServiceRestart, service_restart_from_string); +static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(bus_name, service_name_is_valid); + +static int bus_service_set_transient_property( + Service *s, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(s); + ServiceExecCommand ci; + int r; + + assert(s); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "PermissionsStartOnly")) + return bus_set_transient_bool(u, name, &s->permissions_start_only, message, flags, error); + + if (streq(name, "RootDirectoryStartOnly")) + return bus_set_transient_bool(u, name, &s->root_directory_start_only, message, flags, error); + + if (streq(name, "RemainAfterExit")) + return bus_set_transient_bool(u, name, &s->remain_after_exit, message, flags, error); + + if (streq(name, "GuessMainPID")) + return bus_set_transient_bool(u, name, &s->guess_main_pid, message, flags, error); + + if (streq(name, "Type")) + return bus_set_transient_service_type(u, name, &s->type, message, flags, error); + + if (streq(name, "RestartUSec")) + return bus_set_transient_usec(u, name, &s->restart_usec, message, flags, error); + + if (streq(name, "TimeoutStartUSec")) { + r = bus_set_transient_usec(u, name, &s->timeout_start_usec, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) + s->start_timeout_defined = true; + + return r; + } + + if (streq(name, "TimeoutStopUSec")) + return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error); + + if (streq(name, "RuntimeMaxUSec")) + return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error); + + if (streq(name, "WatchdogUSec")) + return bus_set_transient_usec(u, name, &s->watchdog_usec, message, flags, error); + + if (streq(name, "FileDescriptorStoreMax")) + return bus_set_transient_unsigned(u, name, &s->n_fd_store_max, message, flags, error); + + if (streq(name, "NotifyAccess")) + return bus_set_transient_notify_access(u, name, &s->notify_access, message, flags, error); + + if (streq(name, "PIDFile")) { + _cleanup_free_ char *n = NULL; + const char *v, *e; + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + if (!isempty(v)) { + n = path_make_absolute(v, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + if (!n) + return -ENOMEM; + + path_simplify(n, true); + + if (!path_is_normalized(n)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "PIDFile= path '%s' is not valid", n); + + e = path_startswith(n, "/var/run/"); + if (e) { + char *z; + + z = strjoin("/run/", e); + if (!z) + return log_oom(); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) + log_unit_notice(u, "Transient unit's PIDFile= property references path below legacy directory /var/run, updating %s → %s; please update client accordingly.", n, z); + + free_and_replace(n, z); + } + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + free_and_replace(s->pid_file, n); + unit_write_settingf(u, flags, name, "%s=%s", name, strempty(s->pid_file)); + } + + return 1; + } + + if (streq(name, "USBFunctionDescriptors")) + return bus_set_transient_path(u, name, &s->usb_function_descriptors, message, flags, error); + + if (streq(name, "USBFunctionStrings")) + return bus_set_transient_path(u, name, &s->usb_function_strings, message, flags, error); + + if (streq(name, "BusName")) + return bus_set_transient_bus_name(u, name, &s->bus_name, message, flags, error); + + if (streq(name, "Restart")) + return bus_set_transient_service_restart(u, name, &s->restart, message, flags, error); + + if (streq(name, "RestartPreventExitStatus")) + return bus_set_transient_exit_status(u, name, &s->restart_prevent_status, message, flags, error); + + if (streq(name, "RestartForceExitStatus")) + return bus_set_transient_exit_status(u, name, &s->restart_force_status, message, flags, error); + + if (streq(name, "SuccessExitStatus")) + return bus_set_transient_exit_status(u, name, &s->success_status, message, flags, error); + + ci = service_exec_command_from_string(name); + if (ci >= 0) + return bus_set_transient_exec_command(u, name, &s->exec_command[ci], message, flags, error); + + if (streq(name, "StandardInputFileDescriptor")) + return bus_set_transient_std_fd(u, name, &s->stdin_fd, &s->exec_context.stdio_as_fds, message, flags, error); + + if (streq(name, "StandardOutputFileDescriptor")) + return bus_set_transient_std_fd(u, name, &s->stdout_fd, &s->exec_context.stdio_as_fds, message, flags, error); + + if (streq(name, "StandardErrorFileDescriptor")) + return bus_set_transient_std_fd(u, name, &s->stderr_fd, &s->exec_context.stdio_as_fds, message, flags, error); + + return 0; +} + +int bus_service_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Service *s = SERVICE(u); + int r; + + assert(s); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->transient && u->load_state == UNIT_STUB) { + /* This is a transient unit, let's load a little more */ + + r = bus_service_set_transient_property(s, name, message, flags, error); + if (r != 0) + return r; + + r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_service_commit_properties(Unit *u) { + assert(u); + + unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-service.h b/src/core/dbus-service.h new file mode 100644 index 0000000..22d2b88 --- /dev/null +++ b/src/core/dbus-service.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_service_vtable[]; + +int bus_service_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); +int bus_service_commit_properties(Unit *u); diff --git a/src/core/dbus-slice.c b/src/core/dbus-slice.c new file mode 100644 index 0000000..effd5fa --- /dev/null +++ b/src/core/dbus-slice.c @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "dbus-cgroup.h" +#include "dbus-slice.h" +#include "slice.h" +#include "unit.h" + +const sd_bus_vtable bus_slice_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_VTABLE_END +}; + +int bus_slice_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Slice *s = SLICE(u); + + assert(name); + assert(u); + + return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); +} + +int bus_slice_commit_properties(Unit *u) { + assert(u); + + unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-slice.h b/src/core/dbus-slice.h new file mode 100644 index 0000000..88cc48c --- /dev/null +++ b/src/core/dbus-slice.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_slice_vtable[]; + +int bus_slice_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_slice_commit_properties(Unit *u); diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c new file mode 100644 index 0000000..83ac7ad --- /dev/null +++ b/src/core/dbus-socket.c @@ -0,0 +1,468 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "alloc-util.h" +#include "bus-util.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-kill.h" +#include "dbus-socket.h" +#include "dbus-util.h" +#include "fd-util.h" +#include "ip-protocol-list.h" +#include "parse-util.h" +#include "path-util.h" +#include "socket.h" +#include "socket-util.h" +#include "string-util.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, socket_result, SocketResult); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_bind_ipv6_only, socket_address_bind_ipv6_only, SocketAddressBindIPv6Only); +static BUS_DEFINE_PROPERTY_GET(property_get_fdname, "s", Socket, socket_fdname); + +static int property_get_listen( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Socket *s = SOCKET(userdata); + SocketPort *p; + int r; + + assert(bus); + assert(reply); + assert(s); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(port, p, s->ports) { + _cleanup_free_ char *address = NULL; + const char *a; + + switch (p->type) { + case SOCKET_SOCKET: { + r = socket_address_print(&p->address, &address); + if (r) + return r; + + a = address; + break; + } + + case SOCKET_SPECIAL: + case SOCKET_MQUEUE: + case SOCKET_FIFO: + case SOCKET_USB_FUNCTION: + a = p->path; + break; + + default: + assert_not_reached("Unknown socket type"); + } + + r = sd_bus_message_append(reply, "(ss)", socket_port_type_to_string(p), a); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_socket_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("BindIPv6Only", "s", property_get_bind_ipv6_only, offsetof(Socket, bind_ipv6_only), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Backlog", "u", bus_property_get_unsigned, offsetof(Socket, backlog), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Socket, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindToDevice", "s", NULL, offsetof(Socket, bind_to_device), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SocketUser", "s", NULL, offsetof(Socket, user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SocketGroup", "s", NULL, offsetof(Socket, group), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SocketMode", "u", bus_property_get_mode, offsetof(Socket, socket_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Socket, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Accept", "b", bus_property_get_bool, offsetof(Socket, accept), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Writable", "b", bus_property_get_bool, offsetof(Socket, writable), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAlive", "b", bus_property_get_bool, offsetof(Socket, keep_alive), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAliveTimeUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_time), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAliveIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAliveProbes", "u", bus_property_get_unsigned, offsetof(Socket, keep_alive_cnt), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DeferAcceptUSec" , "t", bus_property_get_usec, offsetof(Socket, defer_accept), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NoDelay", "b", bus_property_get_bool, offsetof(Socket, no_delay), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Priority", "i", bus_property_get_int, offsetof(Socket, priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReceiveBuffer", "t", bus_property_get_size, offsetof(Socket, receive_buffer), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SendBuffer", "t", bus_property_get_size, offsetof(Socket, send_buffer), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IPTOS", "i", bus_property_get_int, offsetof(Socket, ip_tos), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IPTTL", "i", bus_property_get_int, offsetof(Socket, ip_ttl), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PipeSize", "t", bus_property_get_size, offsetof(Socket, pipe_size), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FreeBind", "b", bus_property_get_bool, offsetof(Socket, free_bind), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Transparent", "b", bus_property_get_bool, offsetof(Socket, transparent), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Broadcast", "b", bus_property_get_bool, offsetof(Socket, broadcast), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassCredentials", "b", bus_property_get_bool, offsetof(Socket, pass_cred), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassSecurity", "b", bus_property_get_bool, offsetof(Socket, pass_sec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemoveOnStop", "b", bus_property_get_bool, offsetof(Socket, remove_on_stop), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Listen", "a(ss)", property_get_listen, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Symlinks", "as", NULL, offsetof(Socket, symlinks), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Mark", "i", bus_property_get_int, offsetof(Socket, mark), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MaxConnections", "u", bus_property_get_unsigned, offsetof(Socket, max_connections), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MaxConnectionsPerSource", "u", bus_property_get_unsigned, offsetof(Socket, max_connections_per_source), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MessageQueueMaxMessages", "x", bus_property_get_long, offsetof(Socket, mq_maxmsg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MessageQueueMessageSize", "x", bus_property_get_long, offsetof(Socket, mq_msgsize), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TCPCongestion", "s", NULL, offsetof(Socket, tcp_congestion), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReusePort", "b", bus_property_get_bool, offsetof(Socket, reuse_port), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackLabel", "s", NULL, offsetof(Socket, smack), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackLabelIPIn", "s", NULL, offsetof(Socket, smack_ip_in), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackLabelIPOut", "s", NULL, offsetof(Socket, smack_ip_out), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Socket, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Socket, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NConnections", "u", bus_property_get_unsigned, offsetof(Socket, n_connections), 0), + SD_BUS_PROPERTY("NAccepted", "u", bus_property_get_unsigned, offsetof(Socket, n_accepted), 0), + SD_BUS_PROPERTY("NRefused", "u", bus_property_get_unsigned, offsetof(Socket, n_refused), 0), + SD_BUS_PROPERTY("FileDescriptorName", "s", property_get_fdname, 0, 0), + SD_BUS_PROPERTY("SocketProtocol", "i", bus_property_get_int, offsetof(Socket, socket_protocol), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Socket, exec_command[SOCKET_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Socket, exec_command[SOCKET_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPre", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_VTABLE_END +}; + +static bool check_size_t_truncation(uint64_t t) { + return (size_t) t == t; +} + +static const char* socket_protocol_to_string(int32_t i) { + if (i == IPPROTO_IP) + return ""; + + if (!IN_SET(i, IPPROTO_UDPLITE, IPPROTO_SCTP)) + return NULL; + + return ip_protocol_to_name(i); +} + +static BUS_DEFINE_SET_TRANSIENT(int, "i", int32_t, int, "%" PRIi32); +static BUS_DEFINE_SET_TRANSIENT(message_queue, "x", int64_t, long, "%" PRIi64); +static BUS_DEFINE_SET_TRANSIENT_IS_VALID(size_t_check_truncation, "t", uint64_t, size_t, "%" PRIu64, check_size_t_truncation); +static BUS_DEFINE_SET_TRANSIENT_PARSE(bind_ipv6_only, SocketAddressBindIPv6Only, socket_address_bind_ipv6_only_or_bool_from_string); +static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(fdname, fdname_is_valid); +static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(ifname, ifname_valid); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(ip_tos, "i", int32_t, int, "%" PRIi32, ip_tos_to_string_alloc); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(socket_protocol, "i", int32_t, int, "%" PRIi32, socket_protocol_to_string); + +static int bus_socket_set_transient_property( + Socket *s, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + SocketExecCommand ci; + Unit *u = UNIT(s); + int r; + + assert(s); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Accept")) + return bus_set_transient_bool(u, name, &s->accept, message, flags, error); + + if (streq(name, "Writable")) + return bus_set_transient_bool(u, name, &s->writable, message, flags, error); + + if (streq(name, "KeepAlive")) + return bus_set_transient_bool(u, name, &s->keep_alive, message, flags, error); + + if (streq(name, "NoDelay")) + return bus_set_transient_bool(u, name, &s->no_delay, message, flags, error); + + if (streq(name, "FreeBind")) + return bus_set_transient_bool(u, name, &s->free_bind, message, flags, error); + + if (streq(name, "Transparent")) + return bus_set_transient_bool(u, name, &s->transparent, message, flags, error); + + if (streq(name, "Broadcast")) + return bus_set_transient_bool(u, name, &s->broadcast, message, flags, error); + + if (streq(name, "PassCredentials")) + return bus_set_transient_bool(u, name, &s->pass_cred, message, flags, error); + + if (streq(name, "PassSecurity")) + return bus_set_transient_bool(u, name, &s->pass_sec, message, flags, error); + + if (streq(name, "ReusePort")) + return bus_set_transient_bool(u, name, &s->reuse_port, message, flags, error); + + if (streq(name, "RemoveOnStop")) + return bus_set_transient_bool(u, name, &s->remove_on_stop, message, flags, error); + + if (streq(name, "SELinuxContextFromNet")) + return bus_set_transient_bool(u, name, &s->selinux_context_from_net, message, flags, error); + + if (streq(name, "Priority")) + return bus_set_transient_int(u, name, &s->priority, message, flags, error); + + if (streq(name, "IPTTL")) + return bus_set_transient_int(u, name, &s->ip_ttl, message, flags, error); + + if (streq(name, "Mark")) + return bus_set_transient_int(u, name, &s->mark, message, flags, error); + + if (streq(name, "Backlog")) + return bus_set_transient_unsigned(u, name, &s->backlog, message, flags, error); + + if (streq(name, "MaxConnections")) + return bus_set_transient_unsigned(u, name, &s->max_connections, message, flags, error); + + if (streq(name, "MaxConnectionsPerSource")) + return bus_set_transient_unsigned(u, name, &s->max_connections_per_source, message, flags, error); + + if (streq(name, "KeepAliveProbes")) + return bus_set_transient_unsigned(u, name, &s->keep_alive_cnt, message, flags, error); + + if (streq(name, "TriggerLimitBurst")) + return bus_set_transient_unsigned(u, name, &s->trigger_limit.burst, message, flags, error); + + if (streq(name, "SocketMode")) + return bus_set_transient_mode_t(u, name, &s->socket_mode, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &s->directory_mode, message, flags, error); + + if (streq(name, "MessageQueueMaxMessages")) + return bus_set_transient_message_queue(u, name, &s->mq_maxmsg, message, flags, error); + + if (streq(name, "MessageQueueMessageSize")) + return bus_set_transient_message_queue(u, name, &s->mq_msgsize, message, flags, error); + + if (streq(name, "TimeoutUSec")) + return bus_set_transient_usec_fix_0(u, name, &s->timeout_usec, message, flags, error); + + if (streq(name, "KeepAliveTimeUSec")) + return bus_set_transient_usec(u, name, &s->keep_alive_time, message, flags, error); + + if (streq(name, "KeepAliveIntervalUSec")) + return bus_set_transient_usec(u, name, &s->keep_alive_interval, message, flags, error); + + if (streq(name, "DeferAcceptUSec")) + return bus_set_transient_usec(u, name, &s->defer_accept, message, flags, error); + + if (streq(name, "TriggerLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &s->trigger_limit.interval, message, flags, error); + + if (streq(name, "SmackLabel")) + return bus_set_transient_string(u, name, &s->smack, message, flags, error); + + if (streq(name, "SmackLabelIPin")) + return bus_set_transient_string(u, name, &s->smack_ip_in, message, flags, error); + + if (streq(name, "SmackLabelIPOut")) + return bus_set_transient_string(u, name, &s->smack_ip_out, message, flags, error); + + if (streq(name, "TCPCongestion")) + return bus_set_transient_string(u, name, &s->tcp_congestion, message, flags, error); + + if (streq(name, "FileDescriptorName")) + return bus_set_transient_fdname(u, name, &s->fdname, message, flags, error); + + if (streq(name, "SocketUser")) + return bus_set_transient_user(u, name, &s->user, message, flags, error); + + if (streq(name, "SocketGroup")) + return bus_set_transient_user(u, name, &s->group, message, flags, error); + + if (streq(name, "BindIPv6Only")) + return bus_set_transient_bind_ipv6_only(u, name, &s->bind_ipv6_only, message, flags, error); + + if (streq(name, "ReceiveBuffer")) + return bus_set_transient_size_t_check_truncation(u, name, &s->receive_buffer, message, flags, error); + + if (streq(name, "SendBuffer")) + return bus_set_transient_size_t_check_truncation(u, name, &s->send_buffer, message, flags, error); + + if (streq(name, "PipeSize")) + return bus_set_transient_size_t_check_truncation(u, name, &s->pipe_size, message, flags, error); + + if (streq(name, "BindToDevice")) + return bus_set_transient_ifname(u, name, &s->bind_to_device, message, flags, error); + + if (streq(name, "IPTOS")) + return bus_set_transient_ip_tos(u, name, &s->ip_tos, message, flags, error); + + if (streq(name, "SocketProtocol")) + return bus_set_transient_socket_protocol(u, name, &s->socket_protocol, message, flags, error); + + if ((ci = socket_exec_command_from_string(name)) >= 0) + return bus_set_transient_exec_command(u, name, &s->exec_command[ci], message, flags, error); + + if (streq(name, "Symlinks")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + if (!path_is_absolute(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Symlink path is not absolute: %s", *p); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + s->symlinks = strv_free(s->symlinks); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&s->symlinks, l, true); + if (r < 0) + return -ENOMEM; + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (streq(name, "Listen")) { + const char *t, *a; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &t, &a)) > 0) { + _cleanup_free_ SocketPort *p = NULL; + + p = new(SocketPort, 1); + if (!p) + return log_oom(); + + *p = (SocketPort) { + .fd = -1, + .socket = s, + }; + + p->type = socket_port_type_from_string(t); + if (p->type < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown Socket type: %s", t); + + if (p->type != SOCKET_SOCKET) { + if (!path_is_valid(p->path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid socket path: %s", t); + + p->path = strdup(a); + if (!p->path) + return log_oom(); + + path_simplify(p->path, false); + + } else if (streq(t, "Netlink")) { + r = socket_address_parse_netlink(&p->address, a); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid netlink address: %s", a); + + } else { + r = socket_address_parse(&p->address, a); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address: %s", a); + + p->address.type = socket_address_type_from_string(t); + if (p->address.type < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address type: %s", t); + + if (socket_address_family(&p->address) != AF_LOCAL && p->address.type == SOCK_SEQPACKET) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Address family not supported: %s", a); + } + + empty = false; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + LIST_APPEND(port, s->ports, TAKE_PTR(p)); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Listen%s=%s", t, a); + } + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + socket_free_ports(s); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ListenStream="); + } + + return 1; + } + + return 0; +} + +int bus_socket_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Socket *s = SOCKET(u); + int r; + + assert(s); + assert(name); + assert(message); + + assert(s); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->transient && u->load_state == UNIT_STUB) { + /* This is a transient unit, let's load a little more */ + + r = bus_socket_set_transient_property(s, name, message, flags, error); + if (r != 0) + return r; + + r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_socket_commit_properties(Unit *u) { + assert(u); + + unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-socket.h b/src/core/dbus-socket.h new file mode 100644 index 0000000..9aa8133 --- /dev/null +++ b/src/core/dbus-socket.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_socket_vtable[]; + +int bus_socket_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_socket_commit_properties(Unit *u); diff --git a/src/core/dbus-swap.c b/src/core/dbus-swap.c new file mode 100644 index 0000000..353fa20 --- /dev/null +++ b/src/core/dbus-swap.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/*** + Copyright © 2010 Maarten Lankhorst +***/ + +#include "bus-util.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-swap.h" +#include "string-util.h" +#include "swap.h" +#include "unit.h" + +static int swap_get_priority(Swap *s) { + if (s->from_proc_swaps) + return s->parameters_proc_swaps.priority; + if (s->from_fragment) + return s->parameters_fragment.priority; + return -1; +} + +static const char *swap_get_options(Swap *s) { + if (s->from_fragment) + return s->parameters_fragment.options; + return NULL; +} + +static BUS_DEFINE_PROPERTY_GET(property_get_priority, "i", Swap, swap_get_priority); +static BUS_DEFINE_PROPERTY_GET(property_get_options, "s", Swap, swap_get_options); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, swap_result, SwapResult); + +const sd_bus_vtable bus_swap_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("What", "s", NULL, offsetof(Swap, what), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Priority", "i", property_get_priority, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Options", "s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Swap, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Swap, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Swap, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_VTABLE("ExecActivate", offsetof(Swap, exec_command[SWAP_EXEC_ACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_VTABLE("ExecDeactivate", offsetof(Swap, exec_command[SWAP_EXEC_DEACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_VTABLE_END +}; + +int bus_swap_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Swap *s = SWAP(u); + + assert(s); + assert(name); + assert(message); + + return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); +} + +int bus_swap_commit_properties(Unit *u) { + assert(u); + + unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-swap.h b/src/core/dbus-swap.h new file mode 100644 index 0000000..b114fe0 --- /dev/null +++ b/src/core/dbus-swap.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +/*** + Copyright © 2010 Maarten Lankhorst +***/ + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_swap_vtable[]; + +int bus_swap_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_swap_commit_properties(Unit *u); diff --git a/src/core/dbus-target.c b/src/core/dbus-target.c new file mode 100644 index 0000000..ba50113 --- /dev/null +++ b/src/core/dbus-target.c @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "dbus-target.h" +#include "unit.h" + +const sd_bus_vtable bus_target_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_VTABLE_END +}; diff --git a/src/core/dbus-target.h b/src/core/dbus-target.h new file mode 100644 index 0000000..ad02a1d --- /dev/null +++ b/src/core/dbus-target.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus-vtable.h" + +extern const sd_bus_vtable bus_target_vtable[]; diff --git a/src/core/dbus-timer.c b/src/core/dbus-timer.c new file mode 100644 index 0000000..b9d2f3d --- /dev/null +++ b/src/core/dbus-timer.c @@ -0,0 +1,369 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "alloc-util.h" +#include "bus-util.h" +#include "dbus-timer.h" +#include "dbus-util.h" +#include "strv.h" +#include "timer.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, timer_result, TimerResult); + +static int property_get_monotonic_timers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Timer *t = userdata; + TimerValue *v; + int r; + + assert(bus); + assert(reply); + assert(t); + + r = sd_bus_message_open_container(reply, 'a', "(stt)"); + if (r < 0) + return r; + + LIST_FOREACH(value, v, t->values) { + _cleanup_free_ char *buf = NULL; + const char *s; + size_t l; + + if (v->base == TIMER_CALENDAR) + continue; + + s = timer_base_to_string(v->base); + assert(endswith(s, "Sec")); + + /* s/Sec/USec/ */ + l = strlen(s); + buf = new(char, l+2); + if (!buf) + return -ENOMEM; + + memcpy(buf, s, l-3); + memcpy(buf+l-3, "USec", 5); + + r = sd_bus_message_append(reply, "(stt)", buf, v->value, v->next_elapse); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_calendar_timers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Timer *t = userdata; + TimerValue *v; + int r; + + assert(bus); + assert(reply); + assert(t); + + r = sd_bus_message_open_container(reply, 'a', "(sst)"); + if (r < 0) + return r; + + LIST_FOREACH(value, v, t->values) { + _cleanup_free_ char *buf = NULL; + + if (v->base != TIMER_CALENDAR) + continue; + + r = calendar_spec_to_string(v->calendar_spec, &buf); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "(sst)", timer_base_to_string(v->base), buf, v->next_elapse); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_next_elapse_monotonic( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Timer *t = userdata; + + assert(bus); + assert(reply); + assert(t); + + return sd_bus_message_append(reply, "t", + (uint64_t) usec_shift_clock(t->next_elapse_monotonic_or_boottime, + TIMER_MONOTONIC_CLOCK(t), CLOCK_MONOTONIC)); +} + +const sd_bus_vtable bus_timer_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimersMonotonic", "a(stt)", property_get_monotonic_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("TimersCalendar", "a(sst)", property_get_calendar_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("NextElapseUSecRealtime", "t", bus_property_get_usec, offsetof(Timer, next_elapse_realtime), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NextElapseUSecMonotonic", "t", property_get_next_elapse_monotonic, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("LastTriggerUSec", offsetof(Timer, last_trigger), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Timer, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("AccuracyUSec", "t", bus_property_get_usec, offsetof(Timer, accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RandomizedDelayUSec", "t", bus_property_get_usec, offsetof(Timer, random_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Persistent", "b", bus_property_get_bool, offsetof(Timer, persistent), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WakeSystem", "b", bus_property_get_bool, offsetof(Timer, wake_system), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemainAfterElapse", "b", bus_property_get_bool, offsetof(Timer, remain_after_elapse), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static int bus_timer_set_transient_property( + Timer *t, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(t); + int r; + + assert(t); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "AccuracyUSec")) + return bus_set_transient_usec(u, name, &t->accuracy_usec, message, flags, error); + + if (streq(name, "AccuracySec")) { + log_notice("Client is using obsolete AccuracySec= transient property, please use AccuracyUSec= instead."); + return bus_set_transient_usec(u, "AccuracyUSec", &t->accuracy_usec, message, flags, error); + } + + if (streq(name, "RandomizedDelayUSec")) + return bus_set_transient_usec(u, name, &t->random_usec, message, flags, error); + + if (streq(name, "WakeSystem")) + return bus_set_transient_bool(u, name, &t->wake_system, message, flags, error); + + if (streq(name, "Persistent")) + return bus_set_transient_bool(u, name, &t->persistent, message, flags, error); + + if (streq(name, "RemainAfterElapse")) + return bus_set_transient_bool(u, name, &t->remain_after_elapse, message, flags, error); + + if (streq(name, "TimersMonotonic")) { + const char *base_name; + usec_t usec = 0; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &base_name, &usec)) > 0) { + TimerBase b; + + b = timer_base_from_string(base_name); + if (b < 0 || b == TIMER_CALENDAR) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid timer base: %s", base_name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + char ts[FORMAT_TIMESPAN_MAX]; + TimerValue *v; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", base_name, + format_timespan(ts, sizeof(ts), usec, USEC_PER_MSEC)); + + v = new0(TimerValue, 1); + if (!v) + return -ENOMEM; + + v->base = b; + v->value = usec; + + LIST_PREPEND(value, t->values, v); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + timer_free_values(t); + unit_write_setting(u, flags, name, "OnActiveSec="); + } + + return 1; + + } else if (streq(name, "TimersCalendar")) { + const char *base_name, *str; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &base_name, &str)) > 0) { + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + TimerBase b; + + b = timer_base_from_string(base_name); + if (b != TIMER_CALENDAR) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid timer base: %s", base_name); + + r = calendar_spec_from_string(str, &c); + if (r == -EINVAL) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid calendar spec: %s", str); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + TimerValue *v; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", base_name, str); + + v = new0(TimerValue, 1); + if (!v) + return -ENOMEM; + + v->base = b; + v->calendar_spec = TAKE_PTR(c); + + LIST_PREPEND(value, t->values, v); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + timer_free_values(t); + unit_write_setting(u, flags, name, "OnCalendar="); + } + + return 1; + + } else if (STR_IN_SET(name, + "OnActiveSec", + "OnBootSec", + "OnStartupSec", + "OnUnitActiveSec", + "OnUnitInactiveSec")) { + + TimerValue *v; + TimerBase b = _TIMER_BASE_INVALID; + usec_t usec = 0; + + log_notice("Client is using obsolete %s= transient property, please use TimersMonotonic= instead.", name); + + b = timer_base_from_string(name); + if (b < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown timer base"); + + r = sd_bus_message_read(message, "t", &usec); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + char time[FORMAT_TIMESPAN_MAX]; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, + format_timespan(time, sizeof(time), usec, USEC_PER_MSEC)); + + v = new0(TimerValue, 1); + if (!v) + return -ENOMEM; + + v->base = b; + v->value = usec; + + LIST_PREPEND(value, t->values, v); + } + + return 1; + + } else if (streq(name, "OnCalendar")) { + + TimerValue *v; + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + const char *str; + + log_notice("Client is using obsolete %s= transient property, please use TimersCalendar= instead.", name); + + r = sd_bus_message_read(message, "s", &str); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = calendar_spec_from_string(str, &c); + if (r == -EINVAL) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid calendar spec"); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, str); + + v = new0(TimerValue, 1); + if (!v) + return -ENOMEM; + + v->base = TIMER_CALENDAR; + v->calendar_spec = TAKE_PTR(c); + + LIST_PREPEND(value, t->values, v); + } + + return 1; + } + + return 0; +} + +int bus_timer_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags mode, + sd_bus_error *error) { + + Timer *t = TIMER(u); + + assert(t); + assert(name); + assert(message); + + if (u->transient && u->load_state == UNIT_STUB) + return bus_timer_set_transient_property(t, name, message, mode, error); + + return 0; +} diff --git a/src/core/dbus-timer.h b/src/core/dbus-timer.h new file mode 100644 index 0000000..bb126b2 --- /dev/null +++ b/src/core/dbus-timer.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_timer_vtable[]; + +int bus_timer_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c new file mode 100644 index 0000000..17c2003 --- /dev/null +++ b/src/core/dbus-unit.c @@ -0,0 +1,1951 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bus-common-errors.h" +#include "cgroup-util.h" +#include "condition.h" +#include "dbus-job.h" +#include "dbus-unit.h" +#include "dbus-util.h" +#include "dbus.h" +#include "fd-util.h" +#include "locale-util.h" +#include "log.h" +#include "path-util.h" +#include "process-util.h" +#include "selinux-access.h" +#include "signal-util.h" +#include "special.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" +#include "web-util.h" + +static bool unit_can_start_refuse_manual(Unit *u) { + return unit_can_start(u) && !u->refuse_manual_start; +} + +static bool unit_can_stop_refuse_manual(Unit *u) { + return unit_can_stop(u) && !u->refuse_manual_stop; +} + +static bool unit_can_isolate_refuse_manual(Unit *u) { + return unit_can_isolate(u) && !u->refuse_manual_start; +} + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_collect_mode, collect_mode, CollectMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_load_state, unit_load_state, UnitLoadState); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_job_mode, job_mode, JobMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_emergency_action, emergency_action, EmergencyAction); +static BUS_DEFINE_PROPERTY_GET(property_get_description, "s", Unit, unit_description); +static BUS_DEFINE_PROPERTY_GET2(property_get_active_state, "s", Unit, unit_active_state, unit_active_state_to_string); +static BUS_DEFINE_PROPERTY_GET(property_get_sub_state, "s", Unit, unit_sub_state_to_string); +static BUS_DEFINE_PROPERTY_GET2(property_get_unit_file_state, "s", Unit, unit_get_unit_file_state, unit_file_state_to_string); +static BUS_DEFINE_PROPERTY_GET(property_get_can_reload, "b", Unit, unit_can_reload); +static BUS_DEFINE_PROPERTY_GET(property_get_can_start, "b", Unit, unit_can_start_refuse_manual); +static BUS_DEFINE_PROPERTY_GET(property_get_can_stop, "b", Unit, unit_can_stop_refuse_manual); +static BUS_DEFINE_PROPERTY_GET(property_get_can_isolate, "b", Unit, unit_can_isolate_refuse_manual); +static BUS_DEFINE_PROPERTY_GET(property_get_need_daemon_reload, "b", Unit, unit_need_daemon_reload); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_strv, "as", 0); + +static int property_get_names( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Set **s = userdata; + Iterator i; + const char *t; + int r; + + assert(bus); + assert(reply); + assert(s); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + SET_FOREACH(t, *s, i) { + r = sd_bus_message_append(reply, "s", t); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_following( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata, *f; + + assert(bus); + assert(reply); + assert(u); + + f = unit_following(u); + return sd_bus_message_append(reply, "s", f ? f->id : NULL); +} + +static int property_get_dependencies( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Hashmap **h = userdata; + Iterator j; + Unit *u; + void *v; + int r; + + assert(bus); + assert(reply); + assert(h); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(v, u, *h, j) { + r = sd_bus_message_append(reply, "s", u->id); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_requires_mounts_for( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Hashmap **h = userdata; + const char *p; + Iterator j; + void *v; + int r; + + assert(bus); + assert(reply); + assert(h); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(v, p, *h, j) { + r = sd_bus_message_append(reply, "s", p); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_unit_file_preset( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = unit_get_unit_file_preset(u); + + return sd_bus_message_append(reply, "s", + r < 0 ? NULL: + r > 0 ? "enabled" : "disabled"); +} + +static int property_get_job( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + Job **j = userdata; + + assert(bus); + assert(reply); + assert(j); + + if (!*j) + return sd_bus_message_append(reply, "(uo)", 0, "/"); + + p = job_dbus_path(*j); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(uo)", (*j)->id, p); +} + +static int property_get_conditions( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + const char *(*to_string)(ConditionType type) = NULL; + Condition **list = userdata, *c; + int r; + + assert(bus); + assert(reply); + assert(list); + + to_string = streq(property, "Asserts") ? assert_type_to_string : condition_type_to_string; + + r = sd_bus_message_open_container(reply, 'a', "(sbbsi)"); + if (r < 0) + return r; + + LIST_FOREACH(conditions, c, *list) { + int tristate; + + tristate = + c->result == CONDITION_UNTESTED ? 0 : + c->result == CONDITION_SUCCEEDED ? 1 : -1; + + r = sd_bus_message_append(reply, "(sbbsi)", + to_string(c->type), + c->trigger, c->negate, + c->parameter, tristate); + if (r < 0) + return r; + + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_load_error( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = bus_unit_validate_load_state(u, &e); + if (r < 0) + return sd_bus_message_append(reply, "(ss)", e.name, e.message); + + return sd_bus_message_append(reply, "(ss)", NULL, NULL); +} + +static int bus_verify_manage_units_async_full( + Unit *u, + const char *verb, + int capability, + const char *polkit_message, + bool interactive, + sd_bus_message *call, + sd_bus_error *error) { + + const char *details[9] = { + "unit", u->id, + "verb", verb, + }; + + if (polkit_message) { + details[4] = "polkit.message"; + details[5] = polkit_message; + details[6] = "polkit.gettext_domain"; + details[7] = GETTEXT_PACKAGE; + } + + return bus_verify_polkit_async( + call, + capability, + "org.freedesktop.systemd1.manage-units", + details, + interactive, + UID_INVALID, + &u->manager->polkit_registry, + error); +} + +int bus_unit_method_start_generic( + sd_bus_message *message, + Unit *u, + JobType job_type, + bool reload_if_possible, + sd_bus_error *error) { + + const char *smode; + JobMode mode; + _cleanup_free_ char *verb = NULL; + static const char *const polkit_message_for_job[_JOB_TYPE_MAX] = { + [JOB_START] = N_("Authentication is required to start '$(unit)'."), + [JOB_STOP] = N_("Authentication is required to stop '$(unit)'."), + [JOB_RELOAD] = N_("Authentication is required to reload '$(unit)'."), + [JOB_RESTART] = N_("Authentication is required to restart '$(unit)'."), + [JOB_TRY_RESTART] = N_("Authentication is required to restart '$(unit)'."), + }; + int r; + + assert(message); + assert(u); + assert(job_type >= 0 && job_type < _JOB_TYPE_MAX); + + r = mac_selinux_unit_access_check( + u, message, + job_type_to_access_method(job_type), + error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &smode); + if (r < 0) + return r; + + mode = job_mode_from_string(smode); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode); + + if (reload_if_possible) + verb = strjoin("reload-or-", job_type_to_string(job_type)); + else + verb = strdup(job_type_to_string(job_type)); + if (!verb) + return -ENOMEM; + + r = bus_verify_manage_units_async_full( + u, + verb, + CAP_SYS_ADMIN, + job_type < _JOB_TYPE_MAX ? polkit_message_for_job[job_type] : NULL, + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + return bus_unit_queue_job(message, u, job_type, mode, reload_if_possible, error); +} + +static int method_start(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_START, false, error); +} + +static int method_stop(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_STOP, false, error); +} + +static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_RELOAD, false, error); +} + +static int method_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_RESTART, false, error); +} + +static int method_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, false, error); +} + +static int method_reload_or_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_RESTART, true, error); +} + +static int method_reload_or_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, true, error); +} + +int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + const char *swho; + int32_t signo; + KillWho who; + int r; + + assert(message); + assert(u); + + r = mac_selinux_unit_access_check(u, message, "stop", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "si", &swho, &signo); + if (r < 0) + return r; + + if (isempty(swho)) + who = KILL_ALL; + else { + who = kill_who_from_string(swho); + if (who < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid who argument %s", swho); + } + + if (!SIGNAL_VALID(signo)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Signal number out of range."); + + r = bus_verify_manage_units_async_full( + u, + "kill", + CAP_KILL, + N_("Authentication is required to send a UNIX signal to the processes of '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_kill(u, who, signo, error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + r = mac_selinux_unit_access_check(u, message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "reset-failed", + CAP_SYS_ADMIN, + N_("Authentication is required to reset the \"failed\" state of '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + unit_reset_failed(u); + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + int runtime, r; + + assert(message); + assert(u); + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &runtime); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "set-property", + CAP_SYS_ADMIN, + N_("Authentication is required to set properties on '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = bus_unit_set_properties(u, message, runtime ? UNIT_RUNTIME : UNIT_PERSISTENT, true, error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "ref", + CAP_SYS_ADMIN, + NULL, + false, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = bus_unit_track_add_sender(u, message); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + r = bus_unit_track_remove_sender(u, message); + if (r == -EUNATCH) + return sd_bus_error_setf(error, BUS_ERROR_NOT_REFERENCED, "Unit has not been referenced yet."); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int property_get_refs( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + const char *i; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (i = sd_bus_track_first(u->bus_track); i; i = sd_bus_track_next(u->bus_track)) { + int c, k; + + c = sd_bus_track_count_name(u->bus_track, i); + if (c < 0) + return c; + + /* Add the item multiple times if the ref count for each is above 1 */ + for (k = 0; k < c; k++) { + r = sd_bus_message_append(reply, "s", i); + if (r < 0) + return r; + } + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_unit_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Id", "s", NULL, offsetof(Unit, id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Names", "as", property_get_names, offsetof(Unit, names), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Following", "s", property_get_following, 0, 0), + SD_BUS_PROPERTY("Requires", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_REQUIRES]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Requisite", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_REQUISITE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Wants", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_WANTS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindsTo", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_BINDS_TO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PartOf", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_PART_OF]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequiredBy", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_REQUIRED_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequisiteOf", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_REQUISITE_OF]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WantedBy", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_WANTED_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BoundBy", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_BOUND_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConsistsOf", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_CONSISTS_OF]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Conflicts", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_CONFLICTS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConflictedBy", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_CONFLICTED_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Before", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_BEFORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("After", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_AFTER]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnFailure", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_ON_FAILURE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Triggers", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_TRIGGERS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggeredBy", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_TRIGGERED_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PropagatesReloadTo", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_PROPAGATES_RELOAD_TO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReloadPropagatedFrom", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_RELOAD_PROPAGATED_FROM]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JoinsNamespaceOf", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_JOINS_NAMESPACE_OF]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_requires_mounts_for, offsetof(Unit, requires_mounts_for), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Documentation", "as", NULL, offsetof(Unit, documentation), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Description", "s", property_get_description, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LoadState", "s", property_get_load_state, offsetof(Unit, load_state), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ActiveState", "s", property_get_active_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("SubState", "s", property_get_sub_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("FragmentPath", "s", NULL, offsetof(Unit, fragment_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SourcePath", "s", NULL, offsetof(Unit, source_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DropInPaths", "as", NULL, offsetof(Unit, dropin_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UnitFileState", "s", property_get_unit_file_state, 0, 0), + SD_BUS_PROPERTY("UnitFilePreset", "s", property_get_unit_file_preset, 0, 0), + BUS_PROPERTY_DUAL_TIMESTAMP("StateChangeTimestamp", offsetof(Unit, state_change_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("InactiveExitTimestamp", offsetof(Unit, inactive_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("ActiveEnterTimestamp", offsetof(Unit, active_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("ActiveExitTimestamp", offsetof(Unit, active_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("InactiveEnterTimestamp", offsetof(Unit, inactive_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CanStart", "b", property_get_can_start, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanStop", "b", property_get_can_stop, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanReload", "b", property_get_can_reload, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanIsolate", "b", property_get_can_isolate, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Job", "(uo)", property_get_job, offsetof(Unit, job), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("StopWhenUnneeded", "b", bus_property_get_bool, offsetof(Unit, stop_when_unneeded), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RefuseManualStart", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_start), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IgnoreOnIsolate", "b", bus_property_get_bool, offsetof(Unit, ignore_on_isolate), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NeedDaemonReload", "b", property_get_need_daemon_reload, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_timeout), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobRunningTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_running_timeout), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobTimeoutAction", "s", property_get_emergency_action, offsetof(Unit, job_timeout_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobTimeoutRebootArgument", "s", NULL, offsetof(Unit, job_timeout_reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConditionResult", "b", bus_property_get_bool, offsetof(Unit, condition_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("AssertResult", "b", bus_property_get_bool, offsetof(Unit, assert_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("ConditionTimestamp", offsetof(Unit, condition_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("AssertTimestamp", offsetof(Unit, assert_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Conditions", "a(sbbsi)", property_get_conditions, offsetof(Unit, conditions), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("Asserts", "a(sbbsi)", property_get_conditions, offsetof(Unit, asserts), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("LoadError", "(ss)", property_get_load_error, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Transient", "b", bus_property_get_bool, offsetof(Unit, transient), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Perpetual", "b", bus_property_get_bool, offsetof(Unit, perpetual), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Unit, start_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StartLimitAction", "s", property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FailureAction", "s", property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FailureActionExitStatus", "i", bus_property_get_int, offsetof(Unit, failure_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SuccessAction", "s", property_get_emergency_action, offsetof(Unit, success_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SuccessActionExitStatus", "i", bus_property_get_int, offsetof(Unit, success_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("InvocationID", "ay", bus_property_get_id128, offsetof(Unit, invocation_id), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CollectMode", "s", property_get_collect_mode, offsetof(Unit, collect_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Refs", "as", property_get_refs, 0, 0), + + SD_BUS_METHOD("Start", "s", "o", method_start, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Stop", "s", "o", method_stop, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Reload", "s", "o", method_reload, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Restart", "s", "o", method_restart, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("TryRestart", "s", "o", method_try_restart, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ReloadOrRestart", "s", "o", method_reload_or_restart, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ReloadOrTryRestart", "s", "o", method_reload_or_try_restart, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Kill", "si", NULL, bus_unit_method_kill, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ResetFailed", NULL, NULL, bus_unit_method_reset_failed, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("SetProperties", "ba(sv)", NULL, bus_unit_method_set_properties, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Ref", NULL, NULL, bus_unit_method_ref, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Unref", NULL, NULL, bus_unit_method_unref, SD_BUS_VTABLE_UNPRIVILEGED), + + /* For dependency types we don't support anymore always return an empty array */ + SD_BUS_PROPERTY("RequiresOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RequisiteOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RequiredByOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RequisiteOfOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + /* Obsolete alias names */ + SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Unit, start_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_VTABLE_END +}; + +static int property_get_slice( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + + assert(bus); + assert(reply); + assert(u); + + return sd_bus_message_append(reply, "s", unit_slice_name(u)); +} + +static int property_get_current_memory( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t sz = (uint64_t) -1; + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = unit_get_memory_current(u, &sz); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get memory.usage_in_bytes attribute: %m"); + + return sd_bus_message_append(reply, "t", sz); +} + +static int property_get_current_tasks( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t cn = (uint64_t) -1; + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = unit_get_tasks_current(u, &cn); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get pids.current attribute: %m"); + + return sd_bus_message_append(reply, "t", cn); +} + +static int property_get_cpu_usage( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + nsec_t ns = (nsec_t) -1; + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = unit_get_cpu_usage(u, &ns); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get cpuacct.usage attribute: %m"); + + return sd_bus_message_append(reply, "t", ns); +} + +static int property_get_cgroup( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + const char *t = NULL; + + assert(bus); + assert(reply); + assert(u); + + /* Three cases: a) u->cgroup_path is NULL, in which case the + * unit has no control group, which we report as the empty + * string. b) u->cgroup_path is the empty string, which + * indicates the root cgroup, which we report as "/". c) all + * other cases we report as-is. */ + + if (u->cgroup_path) + t = empty_to_root(u->cgroup_path); + + return sd_bus_message_append(reply, "s", t); +} + +static int append_process(sd_bus_message *reply, const char *p, pid_t pid, Set *pids) { + _cleanup_free_ char *buf = NULL, *cmdline = NULL; + int r; + + assert(reply); + assert(pid > 0); + + r = set_put(pids, PID_TO_PTR(pid)); + if (IN_SET(r, 0, -EEXIST)) + return 0; + if (r < 0) + return r; + + if (!p) { + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &buf); + if (r == -ESRCH) + return 0; + if (r < 0) + return r; + + p = buf; + } + + (void) get_process_cmdline(pid, 0, true, &cmdline); + + return sd_bus_message_append(reply, + "(sus)", + p, + (uint32_t) pid, + cmdline); +} + +static int append_cgroup(sd_bus_message *reply, const char *p, Set *pids) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(reply); + assert(p); + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, p, &f); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + for (;;) { + pid_t pid; + + r = cg_read_pid(f, &pid); + if (r < 0) + return r; + if (r == 0) + break; + + if (is_kernel_thread(pid) > 0) + continue; + + r = append_process(reply, p, pid, pids); + if (r < 0) + return r; + } + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, p, &d); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *g = NULL, *j = NULL; + + r = cg_read_subgroup(d, &g); + if (r < 0) + return r; + if (r == 0) + break; + + j = strjoin(p, "/", g); + if (!j) + return -ENOMEM; + + r = append_cgroup(reply, j, pids); + if (r < 0) + return r; + } + + return 0; +} + +int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_set_free_ Set *pids = NULL; + Unit *u = userdata; + pid_t pid; + int r; + + assert(message); + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + pids = set_new(NULL); + if (!pids) + return -ENOMEM; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(sus)"); + if (r < 0) + return r; + + if (u->cgroup_path) { + r = append_cgroup(reply, u->cgroup_path, pids); + if (r < 0) + return r; + } + + /* The main and control pids might live outside of the cgroup, hence fetch them separately */ + pid = unit_main_pid(u); + if (pid > 0) { + r = append_process(reply, NULL, pid, pids); + if (r < 0) + return r; + } + + pid = unit_control_pid(u); + if (pid > 0) { + r = append_process(reply, NULL, pid, pids); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int property_get_ip_counter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupIPAccountingMetric metric; + uint64_t value = (uint64_t) -1; + Unit *u = userdata; + + assert(bus); + assert(reply); + assert(property); + assert(u); + + if (streq(property, "IPIngressBytes")) + metric = CGROUP_IP_INGRESS_BYTES; + else if (streq(property, "IPIngressPackets")) + metric = CGROUP_IP_INGRESS_PACKETS; + else if (streq(property, "IPEgressBytes")) + metric = CGROUP_IP_EGRESS_BYTES; + else { + assert(streq(property, "IPEgressPackets")); + metric = CGROUP_IP_EGRESS_PACKETS; + } + + (void) unit_get_ip_accounting(u, metric, &value); + return sd_bus_message_append(reply, "t", value); +} + +int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + _cleanup_set_free_ Set *pids = NULL; + Unit *u = userdata; + const char *path; + int r; + + assert(message); + + /* This migrates the processes with the specified PIDs into the cgroup of this unit, optionally below a + * specified cgroup path. Obviously this only works for units that actually maintain a cgroup + * representation. If a process is already in the cgroup no operation is executed – in this case the specified + * subcgroup path has no effect! */ + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &path); + if (r < 0) + return r; + + path = empty_to_null(path); + if (path) { + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path); + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path); + } + + if (!unit_cgroup_delegate(u)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process migration not available on non-delegated units."); + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing."); + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, 'a', "u"); + if (r < 0) + return r; + for (;;) { + uid_t process_uid, sender_uid; + uint32_t upid; + pid_t pid; + + r = sd_bus_message_read(message, "u", &upid); + if (r < 0) + return r; + if (r == 0) + break; + + if (upid == 0) { + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } else + pid = (uid_t) upid; + + /* Filter out duplicates */ + if (set_contains(pids, PID_TO_PTR(pid))) + continue; + + /* Check if this process is suitable for attaching to this unit */ + r = unit_pid_attachable(u, pid, error); + if (r < 0) + return r; + + /* Let's query the sender's UID, so that we can make our security decisions */ + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0) + return r; + + /* Let's validate security: if the sender is root, then all is OK. If the sender is any other unit, + * then the process' UID and the target unit's UID have to match the sender's UID */ + if (sender_uid != 0 && sender_uid != getuid()) { + r = get_process_uid(pid, &process_uid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m"); + + if (process_uid != sender_uid) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid); + if (process_uid != u->ref_uid) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid); + } + + if (!pids) { + pids = set_new(NULL); + if (!pids) + return -ENOMEM; + } + + r = set_put(pids, PID_TO_PTR(pid)); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = unit_attach_pids_to_cgroup(u, pids, path); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to attach processes to control group: %m"); + + return sd_bus_reply_method_return(message, NULL); +} + +const sd_bus_vtable bus_unit_cgroup_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0), + SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0), + SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0), + SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0), + SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0), + SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPEgressPackets", "t", property_get_ip_counter, 0, 0), + SD_BUS_METHOD("GetProcesses", NULL, "a(sus)", bus_unit_method_get_processes, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("AttachProcesses", "sau", NULL, bus_unit_method_attach_processes, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_VTABLE_END +}; + +static int send_new_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Unit *u = userdata; + int r; + + assert(bus); + assert(u); + + p = unit_dbus_path(u); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "UnitNew"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "so", u->id, p); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +static int send_changed_signal(sd_bus *bus, void *userdata) { + _cleanup_free_ char *p = NULL; + Unit *u = userdata; + int r; + + assert(bus); + assert(u); + + p = unit_dbus_path(u); + if (!p) + return -ENOMEM; + + /* Send a properties changed signal. First for the specific + * type, then for the generic unit. The clients may rely on + * this order to get atomic behavior if needed. */ + + r = sd_bus_emit_properties_changed_strv( + bus, p, + unit_dbus_interface_from_type(u->type), + NULL); + if (r < 0) + return r; + + return sd_bus_emit_properties_changed_strv( + bus, p, + "org.freedesktop.systemd1.Unit", + NULL); +} + +void bus_unit_send_change_signal(Unit *u) { + int r; + assert(u); + + if (u->in_dbus_queue) { + LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u); + u->in_dbus_queue = false; + } + + if (!u->id) + return; + + r = bus_foreach_bus(u->manager, u->bus_track, u->sent_dbus_new_signal ? send_changed_signal : send_new_signal, u); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to send unit change signal for %s: %m", u->id); + + u->sent_dbus_new_signal = true; +} + +void bus_unit_send_pending_change_signal(Unit *u, bool including_new) { + + /* Sends out any pending change signals, but only if they really are pending. This call is used when we are + * about to change state in order to force out a PropertiesChanged signal beforehand if there was one pending + * so that clients can follow the full state transition */ + + if (!u->in_dbus_queue) /* If not enqueued, don't bother */ + return; + + if (!u->sent_dbus_new_signal && !including_new) /* If the unit was never announced, don't bother, it's fine if + * the unit appears in the new state right-away (except if the + * caller explicitly asked us to send it anyway) */ + return; + + if (MANAGER_IS_RELOADING(u->manager)) /* Don't generate unnecessary PropertiesChanged signals for the same unit + * when we are reloading. */ + return; + + bus_unit_send_change_signal(u); +} + +static int send_removed_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Unit *u = userdata; + int r; + + assert(bus); + assert(u); + + p = unit_dbus_path(u); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "UnitRemoved"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "so", u->id, p); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +void bus_unit_send_removed_signal(Unit *u) { + int r; + assert(u); + + if (!u->sent_dbus_new_signal || u->in_dbus_queue) + bus_unit_send_change_signal(u); + + if (!u->id) + return; + + r = bus_foreach_bus(u->manager, u->bus_track, send_removed_signal, u); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to send unit remove signal for %s: %m", u->id); +} + +int bus_unit_queue_job( + sd_bus_message *message, + Unit *u, + JobType type, + JobMode mode, + bool reload_if_possible, + sd_bus_error *error) { + + _cleanup_free_ char *path = NULL; + Job *j; + int r; + + assert(message); + assert(u); + assert(type >= 0 && type < _JOB_TYPE_MAX); + assert(mode >= 0 && mode < _JOB_MODE_MAX); + + r = mac_selinux_unit_access_check( + u, message, + job_type_to_access_method(type), + error); + if (r < 0) + return r; + + if (reload_if_possible && unit_can_reload(u)) { + if (type == JOB_RESTART) + type = JOB_RELOAD_OR_START; + else if (type == JOB_TRY_RESTART) + type = JOB_TRY_RELOAD; + } + + if (type == JOB_STOP && + IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_ERROR, UNIT_BAD_SETTING) && + unit_active_state(u) == UNIT_INACTIVE) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", u->id); + + if ((type == JOB_START && u->refuse_manual_start) || + (type == JOB_STOP && u->refuse_manual_stop) || + (IN_SET(type, JOB_RESTART, JOB_TRY_RESTART) && (u->refuse_manual_start || u->refuse_manual_stop)) || + (type == JOB_RELOAD_OR_START && job_type_collapse(type, u) == JOB_START && u->refuse_manual_start)) + return sd_bus_error_setf(error, BUS_ERROR_ONLY_BY_DEPENDENCY, "Operation refused, unit %s may be requested by dependency only (it is configured to refuse manual start/stop).", u->id); + + r = manager_add_job(u->manager, type, u, mode, error, &j); + if (r < 0) + return r; + + r = bus_job_track_sender(j, message); + if (r < 0) + return r; + + path = job_dbus_path(j); + if (!path) + return -ENOMEM; + + /* Before we send the method reply, force out the announcement JobNew for this job */ + bus_job_send_pending_change_signal(j, true); + + return sd_bus_reply_method_return(message, "o", path); +} + +static int bus_unit_set_live_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int r; + + assert(u); + assert(name); + assert(message); + + /* Handles setting properties both "live" (i.e. at any time during runtime), and during creation (for transient + * units that are being created). */ + + if (streq(name, "Description")) { + const char *d; + + r = sd_bus_message_read(message, "s", &d); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_set_description(u, d); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Description=%s", d); + } + + return 1; + } + + return 0; +} + +static int bus_set_transient_emergency_action( + Unit *u, + const char *name, + EmergencyAction *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *s; + EmergencyAction v; + int r; + bool system; + + assert(p); + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + system = MANAGER_IS_SYSTEM(u->manager); + r = parse_emergency_action(s, system, &v); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + r == -EOPNOTSUPP ? "%s setting invalid for manager type: %s" + : "Invalid %s setting: %s", + name, s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = v; + unit_write_settingf(u, flags, name, + "%s=%s", name, s); + } + + return 1; +} + +static int bus_set_transient_exit_status( + Unit *u, + const char *name, + int *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int32_t k; + int r; + + assert(p); + + r = sd_bus_message_read(message, "i", &k); + if (r < 0) + return r; + + if (k > 255) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Exit status must be in range 0…255 or negative."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = k < 0 ? -1 : k; + + if (k < 0) + unit_write_settingf(u, flags, name, "%s=", name); + else + unit_write_settingf(u, flags, name, "%s=%i", name, k); + } + + return 1; +} + +static BUS_DEFINE_SET_TRANSIENT_PARSE(collect_mode, CollectMode, collect_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(job_mode, JobMode, job_mode_from_string); + +static int bus_set_transient_conditions( + Unit *u, + const char *name, + Condition **list, + bool is_condition, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *type_name, *param; + int trigger, negate, r; + bool empty = true; + + assert(list); + + r = sd_bus_message_enter_container(message, 'a', "(sbbs)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(sbbs)", &type_name, &trigger, &negate, ¶m)) > 0) { + ConditionType t; + + t = is_condition ? condition_type_from_string(type_name) : assert_type_from_string(type_name); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid condition type: %s", type_name); + + if (t != CONDITION_NULL) { + if (isempty(param)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Condition parameter in %s is empty", type_name); + + if (condition_takes_path(t) && !path_is_absolute(param)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in condition %s is not absolute: %s", type_name, param); + } else + param = NULL; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + Condition *c; + + c = condition_new(t, param, trigger, negate); + if (!c) + return -ENOMEM; + + LIST_PREPEND(conditions, *list, c); + + if (t != CONDITION_NULL) + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s%s%s", type_name, + trigger ? "|" : "", negate ? "!" : "", param); + else + unit_write_settingf(u, flags, name, + "%s=%s%s", type_name, + trigger ? "|" : "", yes_no(!negate)); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + *list = condition_free_list(*list); + unit_write_settingf(u, flags, name, "%sNull=", is_condition ? "Condition" : "Assert"); + } + + return 1; +} + +static int bus_unit_set_transient_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + UnitDependency d = _UNIT_DEPENDENCY_INVALID; + int r; + + assert(u); + assert(name); + assert(message); + + /* Handles settings when transient units are created. This settings cannot be altered anymore after the unit + * has been created. */ + + if (streq(name, "SourcePath")) + return bus_set_transient_path(u, name, &u->source_path, message, flags, error); + + if (streq(name, "StopWhenUnneeded")) + return bus_set_transient_bool(u, name, &u->stop_when_unneeded, message, flags, error); + + if (streq(name, "RefuseManualStart")) + return bus_set_transient_bool(u, name, &u->refuse_manual_start, message, flags, error); + + if (streq(name, "RefuseManualStop")) + return bus_set_transient_bool(u, name, &u->refuse_manual_stop, message, flags, error); + + if (streq(name, "AllowIsolate")) + return bus_set_transient_bool(u, name, &u->allow_isolate, message, flags, error); + + if (streq(name, "DefaultDependencies")) + return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error); + + if (streq(name, "OnFailureJobMode")) + return bus_set_transient_job_mode(u, name, &u->on_failure_job_mode, message, flags, error); + + if (streq(name, "IgnoreOnIsolate")) + return bus_set_transient_bool(u, name, &u->ignore_on_isolate, message, flags, error); + + if (streq(name, "JobTimeoutUSec")) { + r = bus_set_transient_usec_fix_0(u, name, &u->job_timeout, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags) && !u->job_running_timeout_set) + u->job_running_timeout = u->job_timeout; + } + + if (streq(name, "JobRunningTimeoutUSec")) { + r = bus_set_transient_usec_fix_0(u, name, &u->job_running_timeout, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) + u->job_running_timeout_set = true; + + return r; + } + + if (streq(name, "JobTimeoutAction")) + return bus_set_transient_emergency_action(u, name, &u->job_timeout_action, message, flags, error); + + if (streq(name, "JobTimeoutRebootArgument")) + return bus_set_transient_string(u, name, &u->job_timeout_reboot_arg, message, flags, error); + + if (streq(name, "StartLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &u->start_limit.interval, message, flags, error); + + if (streq(name, "StartLimitBurst")) + return bus_set_transient_unsigned(u, name, &u->start_limit.burst, message, flags, error); + + if (streq(name, "StartLimitAction")) + return bus_set_transient_emergency_action(u, name, &u->start_limit_action, message, flags, error); + + if (streq(name, "FailureAction")) + return bus_set_transient_emergency_action(u, name, &u->failure_action, message, flags, error); + + if (streq(name, "SuccessAction")) + return bus_set_transient_emergency_action(u, name, &u->success_action, message, flags, error); + + if (streq(name, "FailureActionExitStatus")) + return bus_set_transient_exit_status(u, name, &u->failure_action_exit_status, message, flags, error); + + if (streq(name, "SuccessActionExitStatus")) + return bus_set_transient_exit_status(u, name, &u->success_action_exit_status, message, flags, error); + + if (streq(name, "RebootArgument")) + return bus_set_transient_string(u, name, &u->reboot_arg, message, flags, error); + + if (streq(name, "CollectMode")) + return bus_set_transient_collect_mode(u, name, &u->collect_mode, message, flags, error); + + if (streq(name, "Conditions")) + return bus_set_transient_conditions(u, name, &u->conditions, true, message, flags, error); + + if (streq(name, "Asserts")) + return bus_set_transient_conditions(u, name, &u->asserts, false, message, flags, error); + + if (streq(name, "Documentation")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + if (!documentation_url_is_valid(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid URL in %s: %s", name, *p); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + u->documentation = strv_free(u->documentation); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + strv_extend_strv(&u->documentation, l, false); + + STRV_FOREACH(p, l) + unit_write_settingf(u, flags, name, "%s=%s", name, *p); + } + } + + return 1; + + } else if (streq(name, "Slice")) { + Unit *slice; + const char *s; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "The slice property is only available for units with control groups."); + if (u->type == UNIT_SLICE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Slice may not be set for slice units."); + if (unit_has_name(u, SPECIAL_INIT_SCOPE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set slice for init.scope"); + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!unit_name_is_valid(s, UNIT_NAME_PLAIN)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name '%s'", s); + + /* Note that we do not dispatch the load queue here yet, as we don't want our own transient unit to be + * loaded while we are still setting it up. Or in other words, we use manager_load_unit_prepare() + * instead of manager_load_unit() on purpose, here. */ + r = manager_load_unit_prepare(u->manager, s, NULL, error, &slice); + if (r < 0) + return r; + + if (slice->type != UNIT_SLICE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit name '%s' is not a slice", s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_set_slice(u, slice); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_PRIVATE, name, "Slice=%s", s); + } + + return 1; + + } else if (streq(name, "RequiresMountsFor")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + if (!path_is_absolute(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not absolute: %s", name, *p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_require_mounts_for(u, *p, UNIT_DEPENDENCY_FILE); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add required mount \"%s\": %m", *p); + + unit_write_settingf(u, flags, name, "%s=%s", name, *p); + } + } + + return 1; + } + + if (streq(name, "RequiresOverridable")) + d = UNIT_REQUIRES; /* redirect for obsolete unit dependency type */ + else if (streq(name, "RequisiteOverridable")) + d = UNIT_REQUISITE; /* same here */ + else + d = unit_dependency_from_string(name); + + if (d >= 0) { + const char *other; + + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "s", &other)) > 0) { + if (!unit_name_is_valid(other, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name %s", other); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *label = NULL; + + r = unit_add_dependency_by_name(u, d, other, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + label = strjoin(name, "-", other); + if (!label) + return -ENOMEM; + + unit_write_settingf(u, flags, label, "%s=%s", unit_dependency_to_string(d), other); + } + + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return 1; + + } else if (streq(name, "AddRef")) { + + int b; + + /* Why is this called "AddRef" rather than just "Ref", or "Reference"? There's already a "Ref()" method + * on the Unit interface, and it's probably not a good idea to expose a property and a method on the + * same interface (well, strictly speaking AddRef isn't exposed as full property, we just read it for + * transient units, but still). And "References" and "ReferencedBy" is already used as unit reference + * dependency type, hence let's not confuse things with that. + * + * Note that we don't acually add the reference to the bus track. We do that only after the setup of + * the transient unit is complete, so that setting this property multiple times in the same transient + * unit creation call doesn't count as individual references. */ + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) + u->bus_track_add = b; + + return 1; + } + + return 0; +} + +int bus_unit_set_properties( + Unit *u, + sd_bus_message *message, + UnitWriteFlags flags, + bool commit, + sd_bus_error *error) { + + bool for_real = false; + unsigned n = 0; + int r; + + assert(u); + assert(message); + + /* We iterate through the array twice. First run we just check + * if all passed data is valid, second run actually applies + * it. This is to implement transaction-like behaviour without + * actually providing full transactions. */ + + r = sd_bus_message_enter_container(message, 'a', "(sv)"); + if (r < 0) + return r; + + for (;;) { + const char *name; + UnitWriteFlags f; + + r = sd_bus_message_enter_container(message, 'r', "sv"); + if (r < 0) + return r; + if (r == 0) { + if (for_real || UNIT_WRITE_FLAGS_NOOP(flags)) + break; + + /* Reached EOF. Let's try again, and this time for realz... */ + r = sd_bus_message_rewind(message, false); + if (r < 0) + return r; + + for_real = true; + continue; + } + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + if (!UNIT_VTABLE(u)->bus_set_property) + return sd_bus_error_setf(error, SD_BUS_ERROR_PROPERTY_READ_ONLY, "Objects of this type do not support setting properties."); + + r = sd_bus_message_enter_container(message, 'v', NULL); + if (r < 0) + return r; + + /* If not for real, then mask out the two target flags */ + f = for_real ? flags : (flags & ~(UNIT_RUNTIME|UNIT_PERSISTENT)); + + r = UNIT_VTABLE(u)->bus_set_property(u, name, message, f, error); + if (r == 0 && u->transient && u->load_state == UNIT_STUB) + r = bus_unit_set_transient_property(u, name, message, f, error); + if (r == 0) + r = bus_unit_set_live_property(u, name, message, f, error); + if (r < 0) + return r; + + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_PROPERTY_READ_ONLY, "Cannot set property %s, or unknown property.", name); + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + n += for_real; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (commit && n > 0 && UNIT_VTABLE(u)->bus_commit_properties) + UNIT_VTABLE(u)->bus_commit_properties(u); + + return n; +} + +int bus_unit_validate_load_state(Unit *u, sd_bus_error *error) { + assert(u); + + /* Generates a pretty error if a unit isn't properly loaded. */ + + switch (u->load_state) { + + case UNIT_LOADED: + return 0; + + case UNIT_NOT_FOUND: + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not found.", u->id); + + case UNIT_BAD_SETTING: + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, "Unit %s has a bad unit file setting.", u->id); + + case UNIT_ERROR: /* Only show .load_error in UNIT_ERROR state */ + return sd_bus_error_set_errnof(error, u->load_error, "Unit %s failed to load properly: %m.", u->id); + + case UNIT_MASKED: + return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit %s is masked.", u->id); + + case UNIT_STUB: + case UNIT_MERGED: + default: + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unexpected load state of unit %s", u->id); + } +} + +static int bus_unit_track_handler(sd_bus_track *t, void *userdata) { + Unit *u = userdata; + + assert(t); + assert(u); + + u->bus_track = sd_bus_track_unref(u->bus_track); /* make sure we aren't called again */ + + /* If the client that tracks us disappeared, then there's reason to believe that the cgroup is empty now too, + * let's see */ + unit_add_to_cgroup_empty_queue(u); + + /* Also add the unit to the GC queue, after all if the client left it might be time to GC this unit */ + unit_add_to_gc_queue(u); + + return 0; +} + +static int bus_unit_allocate_bus_track(Unit *u) { + int r; + + assert(u); + + if (u->bus_track) + return 0; + + r = sd_bus_track_new(u->manager->api_bus, &u->bus_track, bus_unit_track_handler, u); + if (r < 0) + return r; + + r = sd_bus_track_set_recursive(u->bus_track, true); + if (r < 0) { + u->bus_track = sd_bus_track_unref(u->bus_track); + return r; + } + + return 0; +} + +int bus_unit_track_add_name(Unit *u, const char *name) { + int r; + + assert(u); + + r = bus_unit_allocate_bus_track(u); + if (r < 0) + return r; + + return sd_bus_track_add_name(u->bus_track, name); +} + +int bus_unit_track_add_sender(Unit *u, sd_bus_message *m) { + int r; + + assert(u); + + r = bus_unit_allocate_bus_track(u); + if (r < 0) + return r; + + return sd_bus_track_add_sender(u->bus_track, m); +} + +int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m) { + assert(u); + + /* If we haven't allocated the bus track object yet, then there's definitely no reference taken yet, return an + * error */ + if (!u->bus_track) + return -EUNATCH; + + return sd_bus_track_remove_sender(u->bus_track, m); +} diff --git a/src/core/dbus-unit.h b/src/core/dbus-unit.h new file mode 100644 index 0000000..345345e --- /dev/null +++ b/src/core/dbus-unit.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "job.h" +#include "unit.h" + +extern const sd_bus_vtable bus_unit_vtable[]; +extern const sd_bus_vtable bus_unit_cgroup_vtable[]; + +void bus_unit_send_change_signal(Unit *u); +void bus_unit_send_pending_change_signal(Unit *u, bool including_new); +void bus_unit_send_removed_signal(Unit *u); + +int bus_unit_method_start_generic(sd_bus_message *message, Unit *u, JobType job_type, bool reload_if_possible, sd_bus_error *error); +int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags flags, bool commit, sd_bus_error *error); +int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int bus_unit_queue_job(sd_bus_message *message, Unit *u, JobType type, JobMode mode, bool reload_if_possible, sd_bus_error *error); +int bus_unit_validate_load_state(Unit *u, sd_bus_error *error); + +int bus_unit_track_add_name(Unit *u, const char *name); +int bus_unit_track_add_sender(Unit *u, sd_bus_message *m); +int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m); diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c new file mode 100644 index 0000000..f4fbb72 --- /dev/null +++ b/src/core/dbus-util.c @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "bus-util.h" +#include "dbus-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "unit-printf.h" +#include "user-util.h" +#include "unit.h" + +int bus_property_get_triggered_unit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata, *trigger; + + assert(bus); + assert(reply); + assert(u); + + trigger = UNIT_TRIGGER(u); + + return sd_bus_message_append(reply, "s", trigger ? trigger->id : NULL); +} + +BUS_DEFINE_SET_TRANSIENT(mode_t, "u", uint32_t, mode_t, "%040o"); +BUS_DEFINE_SET_TRANSIENT(unsigned, "u", uint32_t, unsigned, "%" PRIu32); +BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(user, valid_user_group_name_or_id); +BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(path, path_is_absolute); + +int bus_set_transient_string( + Unit *u, + const char *name, + char **p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = free_and_strdup(p, empty_to_null(v)); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s", name, strempty(v)); + } + + return 1; +} + +int bus_set_transient_bool( + Unit *u, + const char *name, + bool *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int v, r; + + assert(p); + + r = sd_bus_message_read(message, "b", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = v; + unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(v)); + } + + return 1; +} + +int bus_set_transient_usec_internal( + Unit *u, + const char *name, + usec_t *p, + bool fix_0, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + uint64_t v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "t", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + char *n, ts[FORMAT_TIMESPAN_MAX]; + + if (fix_0) + *p = v != 0 ? v: USEC_INFINITY; + else + *p = v; + + n = strndupa(name, strlen(name) - 4); + unit_write_settingf(u, flags, name, "%sSec=%s", n, + format_timespan(ts, sizeof(ts), v, USEC_PER_MSEC)); + } + + return 1; +} diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h new file mode 100644 index 0000000..12b055e --- /dev/null +++ b/src/core/dbus-util.h @@ -0,0 +1,248 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" + +#include "unit.h" + +int bus_property_get_triggered_unit(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + +#define BUS_DEFINE_SET_TRANSIENT(function, bus_type, type, cast_type, fmt) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=" fmt, name, v); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_IS_VALID(function, bus_type, type, cast_type, fmt, check) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + if (!check(v)) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: " fmt, name, v); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=" fmt, name, v); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_TO_STRING(function, bus_type, type, cast_type, fmt, to_string) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *s; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + s = to_string(v); \ + if (!s) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: " fmt, name, v); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, s); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(function, bus_type, type, cast_type, fmt, to_string) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + _cleanup_free_ char *s = NULL; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + r = to_string(v, &s); \ + if (r == -EINVAL) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: " fmt, name, v); \ + if (r < 0) \ + return r; \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, s); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_PARSE(function, type, parse) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *s; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "s", &s); \ + if (r < 0) \ + return r; \ + \ + v = parse(s); \ + if (v < 0) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: %s", name, s); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, s); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(function, type, parse) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *s; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "s", &s); \ + if (r < 0) \ + return r; \ + \ + r = parse(s, &v); \ + if (r < 0) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: %s", name, s); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, strempty(s)); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(function, check) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + char **p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "s", &v); \ + if (r < 0) \ + return r; \ + \ + if (!isempty(v) && !check(v)) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: %s", name, v); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + r = free_and_strdup(p, empty_to_null(v)); \ + if (r < 0) \ + return r; \ + \ + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, \ + "%s=%s", name, strempty(v)); \ + } \ + \ + return 1; \ + } + +int bus_set_transient_mode_t(Unit *u, const char *name, mode_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_unsigned(Unit *u, const char *name, unsigned *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_user(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { + return bus_set_transient_usec_internal(u, name, p, false, message, flags, error); +} +static inline int bus_set_transient_usec_fix_0(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { + return bus_set_transient_usec_internal(u, name, p, true, message, flags, error); +} diff --git a/src/core/dbus.c b/src/core/dbus.c new file mode 100644 index 0000000..255b86e --- /dev/null +++ b/src/core/dbus.c @@ -0,0 +1,1315 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <sys/epoll.h> +#include <unistd.h> + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-internal.h" +#include "bus-util.h" +#include "dbus-automount.h" +#include "dbus-cgroup.h" +#include "dbus-device.h" +#include "dbus-execute.h" +#include "dbus-job.h" +#include "dbus-kill.h" +#include "dbus-manager.h" +#include "dbus-mount.h" +#include "dbus-path.h" +#include "dbus-scope.h" +#include "dbus-service.h" +#include "dbus-slice.h" +#include "dbus-socket.h" +#include "dbus-swap.h" +#include "dbus-target.h" +#include "dbus-timer.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "missing.h" +#include "mkdir.h" +#include "process-util.h" +#include "selinux-access.h" +#include "serialize.h" +#include "service.h" +#include "special.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "user-util.h" + +#define CONNECTIONS_MAX 4096 + +static void destroy_bus(Manager *m, sd_bus **bus); + +int bus_send_pending_reload_message(Manager *m) { + int r; + + assert(m); + + if (!m->pending_reload_message) + return 0; + + /* If we cannot get rid of this message we won't dispatch any D-Bus messages, so that we won't end up wanting + * to queue another message. */ + + r = sd_bus_send(NULL, m->pending_reload_message, NULL); + if (r < 0) + log_warning_errno(r, "Failed to send queued message, ignoring: %m"); + + m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message); + + return 0; +} + +int bus_forward_agent_released(Manager *m, const char *path) { + int r; + + assert(m); + assert(path); + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + if (!m->system_bus) + return 0; + + /* If we are running a system instance we forward the agent message on the system bus, so that the user + * instances get notified about this, too */ + + r = sd_bus_emit_signal(m->system_bus, + "/org/freedesktop/systemd1/agent", + "org.freedesktop.systemd1.Agent", + "Released", + "s", path); + if (r < 0) + return log_debug_errno(r, "Failed to propagate agent release message: %m"); + + return 1; +} + +static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + Manager *m = userdata; + const char *cgroup; + uid_t sender_uid; + int r; + + assert(message); + assert(m); + + /* only accept org.freedesktop.systemd1.Agent from UID=0 */ + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0 || sender_uid != 0) + return 0; + + /* parse 'cgroup-empty' notification */ + r = sd_bus_message_read(message, "s", &cgroup); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + manager_notify_cgroup_empty(m, cgroup); + return 0; +} + +static int signal_disconnected(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + sd_bus *bus; + + assert(message); + assert(m); + assert_se(bus = sd_bus_message_get_bus(message)); + + if (bus == m->api_bus) + bus_done_api(m); + if (bus == m->system_bus) + bus_done_system(m); + + if (set_remove(m->private_buses, bus)) { + log_debug("Got disconnect on private connection."); + destroy_bus(m, &bus); + } + + return 0; +} + +static int signal_activation_request(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SERVICE) || + manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SOCKET)) { + r = sd_bus_error_setf(&error, BUS_ERROR_SHUTTING_DOWN, "Refusing activation, D-Bus is shutting down."); + goto failed; + } + + r = manager_load_unit(m, name, NULL, &error, &u); + if (r < 0) + goto failed; + + if (u->refuse_manual_start) { + r = sd_bus_error_setf(&error, BUS_ERROR_ONLY_BY_DEPENDENCY, "Operation refused, %s may be requested by dependency only (it is configured to refuse manual start/stop).", u->id); + goto failed; + } + + r = manager_add_job(m, JOB_START, u, JOB_REPLACE, &error, NULL); + if (r < 0) + goto failed; + + /* Successfully queued, that's it for us */ + return 0; + +failed: + if (!sd_bus_error_is_set(&error)) + sd_bus_error_set_errno(&error, r); + + log_debug("D-Bus activation failed for %s: %s", name, bus_error_message(&error, r)); + + r = sd_bus_message_new_signal(sd_bus_message_get_bus(message), &reply, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Activator", "ActivationFailure"); + if (r < 0) { + bus_log_create_error(r); + return 0; + } + + r = sd_bus_message_append(reply, "sss", name, error.name, error.message); + if (r < 0) { + bus_log_create_error(r); + return 0; + } + + r = sd_bus_send_to(NULL, reply, "org.freedesktop.DBus", NULL); + if (r < 0) + return log_error_errno(r, "Failed to respond with to bus activation request: %m"); + + return 0; +} + +#if HAVE_SELINUX +static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *verb, *path; + Unit *u = NULL; + Job *j; + int r; + + assert(message); + + /* Our own method calls are all protected individually with + * selinux checks, but the built-in interfaces need to be + * protected too. */ + + if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", "Set")) + verb = "reload"; + else if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Introspectable", NULL) || + sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", NULL) || + sd_bus_message_is_method_call(message, "org.freedesktop.DBus.ObjectManager", NULL) || + sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Peer", NULL)) + verb = "status"; + else + return 0; + + path = sd_bus_message_get_path(message); + + if (object_path_startswith("/org/freedesktop/systemd1", path)) { + r = mac_selinux_access_check(message, verb, error); + if (r < 0) + return r; + + return 0; + } + + if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return 0; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return 0; + + u = manager_get_unit_by_pid(m, pid); + } else { + r = manager_get_job_from_dbus_path(m, path, &j); + if (r >= 0) + u = j->unit; + else + manager_load_unit_from_dbus_path(m, path, NULL, &u); + } + if (!u) + return 0; + + r = mac_selinux_unit_access_check(u, message, verb, error); + if (r < 0) + return r; + + return 0; +} +#endif + +static int bus_job_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + Job *j; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = manager_get_job_from_dbus_path(m, path, &j); + if (r < 0) + return 0; + + *found = j; + return 1; +} + +static int find_unit(Manager *m, sd_bus *bus, const char *path, Unit **unit, sd_bus_error *error) { + Unit *u = NULL; /* just to appease gcc, initialization is not really necessary */ + int r; + + assert(m); + assert(bus); + assert(path); + + if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + sd_bus_message *message; + pid_t pid; + + message = sd_bus_get_current_message(bus); + if (!message) + return 0; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return 0; + } else { + r = manager_load_unit_from_dbus_path(m, path, error, &u); + if (r < 0) + return 0; + assert(u); + } + + *unit = u; + return 1; +} + +static int bus_unit_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + return find_unit(m, bus, path, (Unit**) found, error); +} + +static int bus_unit_interface_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + *found = u; + return 1; +} + +static int bus_unit_cgroup_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + *found = u; + return 1; +} + +static int bus_cgroup_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + CGroupContext *c; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + *found = c; + return 1; +} + +static int bus_exec_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + ExecContext *c; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + c = unit_get_exec_context(u); + if (!c) + return 0; + + *found = c; + return 1; +} + +static int bus_kill_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + KillContext *c; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + c = unit_get_kill_context(u); + if (!c) + return 0; + + *found = c; + return 1; +} + +static int bus_job_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + unsigned k = 0; + Iterator i; + Job *j; + + l = new0(char*, hashmap_size(m->jobs)+1); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(j, m->jobs, i) { + l[k] = job_dbus_path(j); + if (!l[k]) + return -ENOMEM; + + k++; + } + + assert(hashmap_size(m->jobs) == k); + + *nodes = TAKE_PTR(l); + + return k; +} + +static int bus_unit_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + unsigned k = 0; + Iterator i; + Unit *u; + + l = new0(char*, hashmap_size(m->units)+1); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(u, m->units, i) { + l[k] = unit_dbus_path(u); + if (!l[k]) + return -ENOMEM; + + k++; + } + + *nodes = TAKE_PTR(l); + + return k; +} + +static int bus_setup_api_vtables(Manager *m, sd_bus *bus) { + UnitType t; + int r; + + assert(m); + assert(bus); + +#if HAVE_SELINUX + r = sd_bus_add_filter(bus, NULL, mac_selinux_filter, m); + if (r < 0) + return log_error_errno(r, "Failed to add SELinux access filter: %m"); +#endif + + r = sd_bus_add_object_vtable(bus, NULL, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", bus_manager_vtable, m); + if (r < 0) + return log_error_errno(r, "Failed to register Manager vtable: %m"); + + r = sd_bus_add_fallback_vtable(bus, NULL, "/org/freedesktop/systemd1/job", "org.freedesktop.systemd1.Job", bus_job_vtable, bus_job_find, m); + if (r < 0) + return log_error_errno(r, "Failed to register Job vtable: %m"); + + r = sd_bus_add_node_enumerator(bus, NULL, "/org/freedesktop/systemd1/job", bus_job_enumerate, m); + if (r < 0) + return log_error_errno(r, "Failed to add job enumerator: %m"); + + r = sd_bus_add_fallback_vtable(bus, NULL, "/org/freedesktop/systemd1/unit", "org.freedesktop.systemd1.Unit", bus_unit_vtable, bus_unit_find, m); + if (r < 0) + return log_error_errno(r, "Failed to register Unit vtable: %m"); + + r = sd_bus_add_node_enumerator(bus, NULL, "/org/freedesktop/systemd1/unit", bus_unit_enumerate, m); + if (r < 0) + return log_error_errno(r, "Failed to add job enumerator: %m"); + + for (t = 0; t < _UNIT_TYPE_MAX; t++) { + const char *interface; + + assert_se(interface = unit_dbus_interface_from_type(t)); + + r = sd_bus_add_fallback_vtable(bus, NULL, "/org/freedesktop/systemd1/unit", interface, unit_vtable[t]->bus_vtable, bus_unit_interface_find, m); + if (r < 0) + return log_error_errno(r, "Failed to register type specific vtable for %s: %m", interface); + + if (unit_vtable[t]->cgroup_context_offset > 0) { + r = sd_bus_add_fallback_vtable(bus, NULL, "/org/freedesktop/systemd1/unit", interface, bus_unit_cgroup_vtable, bus_unit_cgroup_find, m); + if (r < 0) + return log_error_errno(r, "Failed to register control group unit vtable for %s: %m", interface); + + r = sd_bus_add_fallback_vtable(bus, NULL, "/org/freedesktop/systemd1/unit", interface, bus_cgroup_vtable, bus_cgroup_context_find, m); + if (r < 0) + return log_error_errno(r, "Failed to register control group vtable for %s: %m", interface); + } + + if (unit_vtable[t]->exec_context_offset > 0) { + r = sd_bus_add_fallback_vtable(bus, NULL, "/org/freedesktop/systemd1/unit", interface, bus_exec_vtable, bus_exec_context_find, m); + if (r < 0) + return log_error_errno(r, "Failed to register execute vtable for %s: %m", interface); + } + + if (unit_vtable[t]->kill_context_offset > 0) { + r = sd_bus_add_fallback_vtable(bus, NULL, "/org/freedesktop/systemd1/unit", interface, bus_kill_vtable, bus_kill_context_find, m); + if (r < 0) + return log_error_errno(r, "Failed to register kill vtable for %s: %m", interface); + } + } + + return 0; +} + +static int bus_setup_disconnected_match(Manager *m, sd_bus *bus) { + int r; + + assert(m); + assert(bus); + + r = sd_bus_match_signal_async( + bus, + NULL, + "org.freedesktop.DBus.Local", + "/org/freedesktop/DBus/Local", + "org.freedesktop.DBus.Local", + "Disconnected", + signal_disconnected, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for Disconnected message: %m"); + + return 0; +} + +static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_ int nfd = -1; + Manager *m = userdata; + sd_id128_t id; + int r; + + assert(s); + assert(m); + + nfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (nfd < 0) { + log_warning_errno(errno, "Failed to accept private connection, ignoring: %m"); + return 0; + } + + if (set_size(m->private_buses) >= CONNECTIONS_MAX) { + log_warning("Too many concurrent connections, refusing"); + return 0; + } + + r = set_ensure_allocated(&m->private_buses, NULL); + if (r < 0) { + log_oom(); + return 0; + } + + r = sd_bus_new(&bus); + if (r < 0) { + log_warning_errno(r, "Failed to allocate new private connection bus: %m"); + return 0; + } + + (void) sd_bus_set_description(bus, "private-bus-connection"); + + r = sd_bus_set_fd(bus, nfd, nfd); + if (r < 0) { + log_warning_errno(r, "Failed to set fd on new connection bus: %m"); + return 0; + } + + nfd = -1; + + r = bus_check_peercred(bus); + if (r < 0) { + log_warning_errno(r, "Incoming private connection from unprivileged client, refusing: %m"); + return 0; + } + + assert_se(sd_id128_randomize(&id) >= 0); + + r = sd_bus_set_server(bus, 1, id); + if (r < 0) { + log_warning_errno(r, "Failed to enable server support for new connection bus: %m"); + return 0; + } + + r = sd_bus_negotiate_creds(bus, 1, + SD_BUS_CREDS_PID|SD_BUS_CREDS_UID| + SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS| + SD_BUS_CREDS_SELINUX_CONTEXT); + if (r < 0) { + log_warning_errno(r, "Failed to enable credentials for new connection: %m"); + return 0; + } + + r = sd_bus_set_sender(bus, "org.freedesktop.systemd1"); + if (r < 0) { + log_warning_errno(r, "Failed to set direct connection sender: %m"); + return 0; + } + + r = sd_bus_start(bus); + if (r < 0) { + log_warning_errno(r, "Failed to start new connection bus: %m"); + return 0; + } + + r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) { + log_warning_errno(r, "Failed to attach new connection bus to event loop: %m"); + return 0; + } + + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return 0; + + r = bus_setup_api_vtables(m, bus); + if (r < 0) { + log_warning_errno(r, "Failed to set up API vtables on new connection bus: %m"); + return 0; + } + + r = set_put(m->private_buses, bus); + if (r < 0) { + log_warning_errno(r, "Failed to add new connection bus to set: %m"); + return 0; + } + + bus = NULL; + + log_debug("Accepted new private connection."); + + return 0; +} + +static int manager_dispatch_sync_bus_names(sd_event_source *es, void *userdata) { + _cleanup_strv_free_ char **names = NULL; + Manager *m = userdata; + const char *name; + Iterator i; + Unit *u; + int r; + + assert(es); + assert(m); + assert(m->sync_bus_names_event_source == es); + + /* First things first, destroy the defer event so that we aren't triggered again */ + m->sync_bus_names_event_source = sd_event_source_unref(m->sync_bus_names_event_source); + + /* Let's see if there's anything to do still? */ + if (!m->api_bus) + return 0; + if (hashmap_isempty(m->watch_bus)) + return 0; + + /* OK, let's sync up the names. Let's see which names are currently on the bus. */ + r = sd_bus_list_names(m->api_bus, &names, NULL); + if (r < 0) + return log_error_errno(r, "Failed to get initial list of names: %m"); + + /* We have to synchronize the current bus names with the + * list of active services. To do this, walk the list of + * all units with bus names. */ + HASHMAP_FOREACH_KEY(u, name, m->watch_bus, i) { + Service *s = SERVICE(u); + + assert(s); + + if (!streq_ptr(s->bus_name, name)) { + log_unit_warning(u, "Bus name has changed from %s → %s, ignoring.", s->bus_name, name); + continue; + } + + /* Check if a service's bus name is in the list of currently + * active names */ + if (strv_contains(names, name)) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + const char *unique; + + /* If it is, determine its current owner */ + r = sd_bus_get_name_creds(m->api_bus, name, SD_BUS_CREDS_UNIQUE_NAME, &creds); + if (r < 0) { + log_full_errno(r == -ENXIO ? LOG_DEBUG : LOG_ERR, r, "Failed to get bus name owner %s: %m", name); + continue; + } + + r = sd_bus_creds_get_unique_name(creds, &unique); + if (r < 0) { + log_full_errno(r == -ENXIO ? LOG_DEBUG : LOG_ERR, r, "Failed to get unique name for %s: %m", name); + continue; + } + + /* Now, let's compare that to the previous bus owner, and + * if it's still the same, all is fine, so just don't + * bother the service. Otherwise, the name has apparently + * changed, so synthesize a name owner changed signal. */ + + if (!streq_ptr(unique, s->bus_name_owner)) + UNIT_VTABLE(u)->bus_name_owner_change(u, name, s->bus_name_owner, unique); + } else { + /* So, the name we're watching is not on the bus. + * This either means it simply hasn't appeared yet, + * or it was lost during the daemon reload. + * Check if the service has a stored name owner, + * and synthesize a name loss signal in this case. */ + + if (s->bus_name_owner) + UNIT_VTABLE(u)->bus_name_owner_change(u, name, s->bus_name_owner, NULL); + } + } + + return 0; +} + +int manager_enqueue_sync_bus_names(Manager *m) { + int r; + + assert(m); + + /* Enqueues a request to synchronize the bus names in a later event loop iteration. The callers generally don't + * want us to invoke ->bus_name_owner_change() unit calls from their stack frames as this might result in event + * dispatching on its own creating loops, hence we simply create a defer event for the event loop and exit. */ + + if (m->sync_bus_names_event_source) + return 0; + + r = sd_event_add_defer(m->event, &m->sync_bus_names_event_source, manager_dispatch_sync_bus_names, m); + if (r < 0) + return log_error_errno(r, "Failed to create bus name synchronization event: %m"); + + r = sd_event_source_set_priority(m->sync_bus_names_event_source, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return log_error_errno(r, "Failed to set event priority: %m"); + + r = sd_event_source_set_enabled(m->sync_bus_names_event_source, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Failed to set even to oneshot: %m"); + + (void) sd_event_source_set_description(m->sync_bus_names_event_source, "manager-sync-bus-names"); + return 0; +} + +static int bus_setup_api(Manager *m, sd_bus *bus) { + Iterator i; + char *name; + Unit *u; + int r; + + assert(m); + assert(bus); + + /* Let's make sure we have enough credential bits so that we can make security and selinux decisions */ + r = sd_bus_negotiate_creds(bus, 1, + SD_BUS_CREDS_PID|SD_BUS_CREDS_UID| + SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS| + SD_BUS_CREDS_SELINUX_CONTEXT); + if (r < 0) + log_warning_errno(r, "Failed to enable credential passing, ignoring: %m"); + + r = bus_setup_api_vtables(m, bus); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(u, name, m->watch_bus, i) { + r = unit_install_bus_match(u, bus, name); + if (r < 0) + log_error_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name); + } + + r = sd_bus_match_signal_async( + bus, + NULL, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.systemd1.Activator", + "ActivationRequest", + signal_activation_request, NULL, m); + if (r < 0) + log_warning_errno(r, "Failed to subscribe to activation signal: %m"); + + /* Allow replacing of our name, to ease implementation of reexecution, where we keep the old connection open + * until after the new connection is set up and the name installed to allow clients to synchronously wait for + * reexecution to finish */ + r = sd_bus_request_name_async(bus, NULL, "org.freedesktop.systemd1", SD_BUS_NAME_REPLACE_EXISTING|SD_BUS_NAME_ALLOW_REPLACEMENT, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + log_debug("Successfully connected to API bus."); + + return 0; +} + +int bus_init_api(Manager *m) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + int r; + + if (m->api_bus) + return 0; + + /* The API and system bus is the same if we are running in system mode */ + if (MANAGER_IS_SYSTEM(m) && m->system_bus) + bus = sd_bus_ref(m->system_bus); + else { + if (MANAGER_IS_SYSTEM(m)) + r = sd_bus_open_system_with_description(&bus, "bus-api-system"); + else + r = sd_bus_open_user_with_description(&bus, "bus-api-user"); + if (r < 0) + return log_error_errno(r, "Failed to connect to API bus: %m"); + + r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach API bus to event loop: %m"); + + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return r; + } + + r = bus_setup_api(m, bus); + if (r < 0) + return log_error_errno(r, "Failed to set up API bus: %m"); + + m->api_bus = TAKE_PTR(bus); + + r = manager_enqueue_sync_bus_names(m); + if (r < 0) + return r; + + return 0; +} + +static int bus_setup_system(Manager *m, sd_bus *bus) { + int r; + + assert(m); + assert(bus); + + /* if we are a user instance we get the Released message via the system bus */ + if (MANAGER_IS_USER(m)) { + r = sd_bus_match_signal_async( + bus, + NULL, + NULL, + "/org/freedesktop/systemd1/agent", + "org.freedesktop.systemd1.Agent", + "Released", + signal_agent_released, NULL, m); + if (r < 0) + log_warning_errno(r, "Failed to request Released match on system bus: %m"); + } + + log_debug("Successfully connected to system bus."); + return 0; +} + +int bus_init_system(Manager *m) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + int r; + + if (m->system_bus) + return 0; + + /* The API and system bus is the same if we are running in system mode */ + if (MANAGER_IS_SYSTEM(m) && m->api_bus) + bus = sd_bus_ref(m->api_bus); + else { + r = sd_bus_open_system_with_description(&bus, "bus-system"); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach system bus to event loop: %m"); + + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return r; + } + + r = bus_setup_system(m, bus); + if (r < 0) + return log_error_errno(r, "Failed to set up system bus: %m"); + + m->system_bus = TAKE_PTR(bus); + + return 0; +} + +int bus_init_private(Manager *m) { + _cleanup_close_ int fd = -1; + union sockaddr_union sa = {}; + sd_event_source *s; + int r, salen; + + assert(m); + + if (m->private_listen_fd >= 0) + return 0; + + if (MANAGER_IS_SYSTEM(m)) { + + /* We want the private bus only when running as init */ + if (getpid_cached() != 1) + return 0; + + salen = sockaddr_un_set_path(&sa.un, "/run/systemd/private"); + } else { + const char *e, *joined; + + e = secure_getenv("XDG_RUNTIME_DIR"); + if (!e) + return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN), + "XDG_RUNTIME_DIR is not set, refusing."); + + joined = strjoina(e, "/systemd/private"); + salen = sockaddr_un_set_path(&sa.un, joined); + } + if (salen < 0) + return log_error_errno(salen, "Can't set path for AF_UNIX socket to bind to: %m"); + + (void) mkdir_parents_label(sa.un.sun_path, 0755); + (void) sockaddr_un_unlink(&sa.un); + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate private socket: %m"); + + r = bind(fd, &sa.sa, salen); + if (r < 0) + return log_error_errno(errno, "Failed to bind private socket: %m"); + + r = listen(fd, SOMAXCONN); + if (r < 0) + return log_error_errno(errno, "Failed to make private socket listening: %m"); + + /* Generate an inotify event in case somebody waits for this socket to appear using inotify() */ + (void) touch(sa.un.sun_path); + + r = sd_event_add_io(m->event, &s, fd, EPOLLIN, bus_on_connection, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate event source: %m"); + + (void) sd_event_source_set_description(s, "bus-connection"); + + m->private_listen_fd = TAKE_FD(fd); + m->private_listen_event_source = s; + + log_debug("Successfully created private D-Bus server."); + + return 0; +} + +static void destroy_bus(Manager *m, sd_bus **bus) { + Iterator i; + Unit *u; + Job *j; + + assert(m); + assert(bus); + + if (!*bus) + return; + + /* Make sure all bus slots watching names are released. */ + HASHMAP_FOREACH(u, m->watch_bus, i) { + if (!u->match_bus_slot) + continue; + + if (sd_bus_slot_get_bus(u->match_bus_slot) != *bus) + continue; + + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + } + + /* Get rid of tracked clients on this bus */ + if (m->subscribed && sd_bus_track_get_bus(m->subscribed) == *bus) + m->subscribed = sd_bus_track_unref(m->subscribed); + + HASHMAP_FOREACH(j, m->jobs, i) + if (j->bus_track && sd_bus_track_get_bus(j->bus_track) == *bus) + j->bus_track = sd_bus_track_unref(j->bus_track); + + HASHMAP_FOREACH(u, m->units, i) + if (u->bus_track && sd_bus_track_get_bus(u->bus_track) == *bus) + u->bus_track = sd_bus_track_unref(u->bus_track); + + /* Get rid of queued message on this bus */ + if (m->pending_reload_message && sd_bus_message_get_bus(m->pending_reload_message) == *bus) + m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message); + + /* Possibly flush unwritten data, but only if we are + * unprivileged, since we don't want to sync here */ + if (!MANAGER_IS_SYSTEM(m)) + sd_bus_flush(*bus); + + /* And destroy the object */ + *bus = sd_bus_close_unref(*bus); +} + +void bus_done_api(Manager *m) { + destroy_bus(m, &m->api_bus); +} + +void bus_done_system(Manager *m) { + destroy_bus(m, &m->system_bus); +} + +void bus_done_private(Manager *m) { + sd_bus *b; + + assert(m); + + while ((b = set_steal_first(m->private_buses))) + destroy_bus(m, &b); + + m->private_buses = set_free(m->private_buses); + + m->private_listen_event_source = sd_event_source_unref(m->private_listen_event_source); + m->private_listen_fd = safe_close(m->private_listen_fd); +} + +void bus_done(Manager *m) { + assert(m); + + bus_done_api(m); + bus_done_system(m); + bus_done_private(m); + + assert(!m->subscribed); + + m->deserialized_subscribed = strv_free(m->deserialized_subscribed); + bus_verify_polkit_async_registry_free(m->polkit_registry); +} + +int bus_fdset_add_all(Manager *m, FDSet *fds) { + Iterator i; + sd_bus *b; + int fd; + + assert(m); + assert(fds); + + /* When we are about to reexecute we add all D-Bus fds to the + * set to pass over to the newly executed systemd. They won't + * be used there however, except thatt they are closed at the + * very end of deserialization, those making it possible for + * clients to synchronously wait for systemd to reexec by + * simply waiting for disconnection */ + + if (m->api_bus) { + fd = sd_bus_get_fd(m->api_bus); + if (fd >= 0) { + fd = fdset_put_dup(fds, fd); + if (fd < 0) + return fd; + } + } + + SET_FOREACH(b, m->private_buses, i) { + fd = sd_bus_get_fd(b); + if (fd >= 0) { + fd = fdset_put_dup(fds, fd); + if (fd < 0) + return fd; + } + } + + /* We don't offer any APIs on the system bus (well, unless it + * is the same as the API bus) hence we don't bother with it + * here */ + + return 0; +} + +int bus_foreach_bus( + Manager *m, + sd_bus_track *subscribed2, + int (*send_message)(sd_bus *bus, void *userdata), + void *userdata) { + + Iterator i; + sd_bus *b; + int r, ret = 0; + + /* Send to all direct buses, unconditionally */ + SET_FOREACH(b, m->private_buses, i) { + + /* Don't bother with enqueing these messages to clients that haven't started yet */ + if (sd_bus_is_ready(b) <= 0) + continue; + + r = send_message(b, userdata); + if (r < 0) + ret = r; + } + + /* Send to API bus, but only if somebody is subscribed */ + if (m->api_bus && + (sd_bus_track_count(m->subscribed) > 0 || + sd_bus_track_count(subscribed2) > 0)) { + r = send_message(m->api_bus, userdata); + if (r < 0) + ret = r; + } + + return ret; +} + +void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix) { + const char *n; + + assert(f); + assert(prefix); + + for (n = sd_bus_track_first(t); n; n = sd_bus_track_next(t)) { + int c, j; + + c = sd_bus_track_count_name(t, n); + for (j = 0; j < c; j++) + (void) serialize_item(f, prefix, n); + } +} + +int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l) { + int r = 0; + + assert(m); + assert(t); + + if (strv_isempty(l)) + return 0; + + if (!m->api_bus) + return 0; + + if (!*t) { + r = sd_bus_track_new(m->api_bus, t, NULL, NULL); + if (r < 0) + return r; + } + + r = sd_bus_track_set_recursive(*t, recursive); + if (r < 0) + return r; + + return bus_track_add_name_many(*t, l); +} + +int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-units", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-unit-files", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.reload-daemon", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.set-environment", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +uint64_t manager_bus_n_queued_write(Manager *m) { + uint64_t c = 0; + Iterator i; + sd_bus *b; + int r; + + /* Returns the total number of messages queued for writing on all our direct and API busses. */ + + SET_FOREACH(b, m->private_buses, i) { + uint64_t k; + + r = sd_bus_get_n_queued_write(b, &k); + if (r < 0) + log_debug_errno(r, "Failed to query queued messages for private bus: %m"); + else + c += k; + } + + if (m->api_bus) { + uint64_t k; + + r = sd_bus_get_n_queued_write(m->api_bus, &k); + if (r < 0) + log_debug_errno(r, "Failed to query queued messages for API bus: %m"); + else + c += k; + } + + return c; +} + +static void vtable_dump_bus_properties(FILE *f, const sd_bus_vtable *table) { + const sd_bus_vtable *i; + + for (i = table; i->type != _SD_BUS_VTABLE_END; i++) { + if (!IN_SET(i->type, _SD_BUS_VTABLE_PROPERTY, _SD_BUS_VTABLE_WRITABLE_PROPERTY) || + (i->flags & (SD_BUS_VTABLE_DEPRECATED | SD_BUS_VTABLE_HIDDEN)) != 0) + continue; + + fprintf(f, "%s\n", i->x.property.member); + } +} + +void dump_bus_properties(FILE *f) { + assert(f); + + vtable_dump_bus_properties(f, bus_automount_vtable); + vtable_dump_bus_properties(f, bus_cgroup_vtable); + vtable_dump_bus_properties(f, bus_device_vtable); + vtable_dump_bus_properties(f, bus_exec_vtable); + vtable_dump_bus_properties(f, bus_job_vtable); + vtable_dump_bus_properties(f, bus_kill_vtable); + vtable_dump_bus_properties(f, bus_manager_vtable); + vtable_dump_bus_properties(f, bus_mount_vtable); + vtable_dump_bus_properties(f, bus_path_vtable); + vtable_dump_bus_properties(f, bus_scope_vtable); + vtable_dump_bus_properties(f, bus_service_vtable); + vtable_dump_bus_properties(f, bus_slice_vtable); + vtable_dump_bus_properties(f, bus_socket_vtable); + vtable_dump_bus_properties(f, bus_swap_vtable); + vtable_dump_bus_properties(f, bus_target_vtable); + vtable_dump_bus_properties(f, bus_timer_vtable); + vtable_dump_bus_properties(f, bus_unit_vtable); + vtable_dump_bus_properties(f, bus_unit_cgroup_vtable); +} diff --git a/src/core/dbus.h b/src/core/dbus.h new file mode 100644 index 0000000..f1c0fa8 --- /dev/null +++ b/src/core/dbus.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" + +#include "manager.h" + +int bus_send_pending_reload_message(Manager *m); + +int bus_init_private(Manager *m); +int bus_init_api(Manager *m); +int bus_init_system(Manager *m); + +void bus_done_private(Manager *m); +void bus_done_api(Manager *m); +void bus_done_system(Manager *m); +void bus_done(Manager *m); + +int bus_fdset_add_all(Manager *m, FDSet *fds); + +void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix); +int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l); + +int manager_enqueue_sync_bus_names(Manager *m); + +int bus_foreach_bus(Manager *m, sd_bus_track *subscribed2, int (*send_message)(sd_bus *bus, void *userdata), void *userdata); + +int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error); + +int bus_forward_agent_released(Manager *m, const char *path); + +uint64_t manager_bus_n_queued_write(Manager *m); + +void dump_bus_properties(FILE *f); diff --git a/src/core/device.c b/src/core/device.c new file mode 100644 index 0000000..b006add --- /dev/null +++ b/src/core/device.c @@ -0,0 +1,1093 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <sys/epoll.h> + +#include "alloc-util.h" +#include "bus-error.h" +#include "dbus-device.h" +#include "dbus-unit.h" +#include "device-private.h" +#include "device-util.h" +#include "device.h" +#include "log.h" +#include "parse-util.h" +#include "path-util.h" +#include "serialize.h" +#include "stat-util.h" +#include "string-util.h" +#include "swap.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_DEVICE_STATE_MAX] = { + [DEVICE_DEAD] = UNIT_INACTIVE, + [DEVICE_TENTATIVE] = UNIT_ACTIVATING, + [DEVICE_PLUGGED] = UNIT_ACTIVE, +}; + +static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata); +static void device_update_found_one(Device *d, DeviceFound found, DeviceFound mask); + +static void device_unset_sysfs(Device *d) { + Hashmap *devices; + Device *first; + + assert(d); + + if (!d->sysfs) + return; + + /* Remove this unit from the chain of devices which share the + * same sysfs path. */ + devices = UNIT(d)->manager->devices_by_sysfs; + first = hashmap_get(devices, d->sysfs); + LIST_REMOVE(same_sysfs, first, d); + + if (first) + hashmap_remove_and_replace(devices, d->sysfs, first->sysfs, first); + else + hashmap_remove(devices, d->sysfs); + + d->sysfs = mfree(d->sysfs); +} + +static int device_set_sysfs(Device *d, const char *sysfs) { + _cleanup_free_ char *copy = NULL; + Device *first; + int r; + + assert(d); + + if (streq_ptr(d->sysfs, sysfs)) + return 0; + + r = hashmap_ensure_allocated(&UNIT(d)->manager->devices_by_sysfs, &path_hash_ops); + if (r < 0) + return r; + + copy = strdup(sysfs); + if (!copy) + return -ENOMEM; + + device_unset_sysfs(d); + + first = hashmap_get(UNIT(d)->manager->devices_by_sysfs, sysfs); + LIST_PREPEND(same_sysfs, first, d); + + r = hashmap_replace(UNIT(d)->manager->devices_by_sysfs, copy, first); + if (r < 0) { + LIST_REMOVE(same_sysfs, first, d); + return r; + } + + d->sysfs = TAKE_PTR(copy); + return 0; +} + +static void device_init(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + assert(UNIT(d)->load_state == UNIT_STUB); + + /* In contrast to all other unit types we timeout jobs waiting + * for devices by default. This is because they otherwise wait + * indefinitely for plugged in devices, something which cannot + * happen for the other units since their operations time out + * anyway. */ + u->job_running_timeout = u->manager->default_timeout_start_usec; + + u->ignore_on_isolate = true; + + d->deserialized_state = _DEVICE_STATE_INVALID; +} + +static void device_done(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + + device_unset_sysfs(d); + d->wants_property = strv_free(d->wants_property); +} + +static int device_load(Unit *u) { + int r; + + r = unit_load_fragment_and_dropin_optional(u); + if (r < 0) + return r; + + if (!u->description) { + /* Generate a description based on the path, to be used until the + device is initialized properly */ + r = unit_name_to_path(u->id, &u->description); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to unescape name: %m"); + } + + return 0; +} + +static void device_set_state(Device *d, DeviceState state) { + DeviceState old_state; + assert(d); + + if (d->state != state) + bus_unit_send_pending_change_signal(UNIT(d), false); + + old_state = d->state; + d->state = state; + + if (state == DEVICE_DEAD) + device_unset_sysfs(d); + + if (state != old_state) + log_unit_debug(UNIT(d), "Changed %s -> %s", device_state_to_string(old_state), device_state_to_string(state)); + + unit_notify(UNIT(d), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int device_coldplug(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + assert(d->state == DEVICE_DEAD); + + /* First, let's put the deserialized state and found mask into effect, if we have it. */ + + if (d->deserialized_state < 0 || + (d->deserialized_state == d->state && + d->deserialized_found == d->found)) + return 0; + + d->found = d->deserialized_found; + device_set_state(d, d->deserialized_state); + return 0; +} + +static void device_catchup(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + + /* Second, let's update the state with the enumerated state if it's different */ + if (d->enumerated_found == d->found) + return; + + device_update_found_one(d, d->enumerated_found, DEVICE_FOUND_MASK); +} + +static const struct { + DeviceFound flag; + const char *name; +} device_found_map[] = { + { DEVICE_FOUND_UDEV, "found-udev" }, + { DEVICE_FOUND_MOUNT, "found-mount" }, + { DEVICE_FOUND_SWAP, "found-swap" }, +}; + +static int device_found_to_string_many(DeviceFound flags, char **ret) { + _cleanup_free_ char *s = NULL; + unsigned i; + + assert(ret); + + for (i = 0; i < ELEMENTSOF(device_found_map); i++) { + if (!FLAGS_SET(flags, device_found_map[i].flag)) + continue; + + if (!strextend_with_separator(&s, ",", device_found_map[i].name, NULL)) + return -ENOMEM; + } + + *ret = TAKE_PTR(s); + + return 0; +} + +static int device_found_from_string_many(const char *name, DeviceFound *ret) { + DeviceFound flags = 0; + int r; + + assert(ret); + + for (;;) { + _cleanup_free_ char *word = NULL; + DeviceFound f = 0; + unsigned i; + + r = extract_first_word(&name, &word, ",", 0); + if (r < 0) + return r; + if (r == 0) + break; + + for (i = 0; i < ELEMENTSOF(device_found_map); i++) + if (streq(word, device_found_map[i].name)) { + f = device_found_map[i].flag; + break; + } + + if (f == 0) + return -EINVAL; + + flags |= f; + } + + *ret = flags; + return 0; +} + +static int device_serialize(Unit *u, FILE *f, FDSet *fds) { + _cleanup_free_ char *s = NULL; + Device *d = DEVICE(u); + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", device_state_to_string(d->state)); + + if (device_found_to_string_many(d->found, &s) >= 0) + (void) serialize_item(f, "found", s); + + return 0; +} + +static int device_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Device *d = DEVICE(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + DeviceState state; + + state = device_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value, ignoring: %s", value); + else + d->deserialized_state = state; + + } else if (streq(key, "found")) { + r = device_found_from_string_many(value, &d->deserialized_found); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse found value '%s', ignoring: %m", value); + + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static void device_dump(Unit *u, FILE *f, const char *prefix) { + Device *d = DEVICE(u); + _cleanup_free_ char *s = NULL; + + assert(d); + + (void) device_found_to_string_many(d->found, &s); + + fprintf(f, + "%sDevice State: %s\n" + "%sSysfs Path: %s\n" + "%sFound: %s\n", + prefix, device_state_to_string(d->state), + prefix, strna(d->sysfs), + prefix, strna(s)); + + if (!strv_isempty(d->wants_property)) { + char **i; + + STRV_FOREACH(i, d->wants_property) + fprintf(f, "%sudev SYSTEMD_WANTS: %s\n", + prefix, *i); + } +} + +_pure_ static UnitActiveState device_active_state(Unit *u) { + assert(u); + + return state_translation_table[DEVICE(u)->state]; +} + +_pure_ static const char *device_sub_state_to_string(Unit *u) { + assert(u); + + return device_state_to_string(DEVICE(u)->state); +} + +static int device_update_description(Unit *u, sd_device *dev, const char *path) { + _cleanup_free_ char *j = NULL; + const char *model, *label, *desc; + int r; + + assert(u); + assert(path); + + desc = path; + + if (dev && + (sd_device_get_property_value(dev, "ID_MODEL_FROM_DATABASE", &model) >= 0 || + sd_device_get_property_value(dev, "ID_MODEL", &model) >= 0)) { + desc = model; + + /* Try to concatenate the device model string with a label, if there is one */ + if (sd_device_get_property_value(dev, "ID_FS_LABEL", &label) >= 0 || + sd_device_get_property_value(dev, "ID_PART_ENTRY_NAME", &label) >= 0 || + sd_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER", &label) >= 0) { + + desc = j = strjoin(model, " ", label); + if (!j) + return log_oom(); + } + } + + r = unit_set_description(u, desc); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set device description: %m"); + + return 0; +} + +static int device_add_udev_wants(Unit *u, sd_device *dev) { + _cleanup_strv_free_ char **added = NULL; + const char *wants, *property; + Device *d = DEVICE(u); + int r; + + assert(d); + assert(dev); + + property = MANAGER_IS_USER(u->manager) ? "SYSTEMD_USER_WANTS" : "SYSTEMD_WANTS"; + + r = sd_device_get_property_value(dev, property, &wants); + if (r < 0) + return 0; + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&wants, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to parse property %s with value %s: %m", property, wants); + + if (unit_name_is_valid(word, UNIT_NAME_TEMPLATE) && d->sysfs) { + _cleanup_free_ char *escaped = NULL; + + /* If the unit name is specified as template, then automatically fill in the sysfs path of the + * device as instance name, properly escaped. */ + + r = unit_name_path_escape(d->sysfs, &escaped); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to escape %s: %m", d->sysfs); + + r = unit_name_replace_instance(word, escaped, &k); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to build %s instance of template %s: %m", escaped, word); + } else { + /* If this is not a template, then let's mangle it so, that it becomes a valid unit name. */ + + r = unit_name_mangle(word, UNIT_NAME_MANGLE_WARN, &k); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to mangle unit name \"%s\": %m", word); + } + + r = unit_add_dependency_by_name(u, UNIT_WANTS, k, true, UNIT_DEPENDENCY_UDEV); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add Wants= dependency: %m"); + + r = strv_push(&added, k); + if (r < 0) + return log_oom(); + + k = NULL; + } + + if (d->state != DEVICE_DEAD) { + char **i; + + /* So here's a special hack, to compensate for the fact that the udev database's reload cycles are not + * synchronized with our own reload cycles: when we detect that the SYSTEMD_WANTS property of a device + * changes while the device unit is already up, let's manually trigger any new units listed in it not + * seen before. This typically appens during the boot-time switch root transition, as udev devices + * will generally already be up in the initrd, but SYSTEMD_WANTS properties get then added through udev + * rules only available on the host system, and thus only when the initial udev coldplug trigger runs. + * + * We do this only if the device has been up already when we parse this, as otherwise the usual + * dependency logic that is run from the dead → plugged transition will trigger these deps. */ + + STRV_FOREACH(i, added) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + if (strv_contains(d->wants_property, *i)) /* Was this unit already listed before? */ + continue; + + r = manager_add_job_by_name(u->manager, JOB_START, *i, JOB_FAIL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue SYSTEMD_WANTS= job, ignoring: %s", bus_error_message(&error, r)); + } + } + + strv_free(d->wants_property); + d->wants_property = TAKE_PTR(added); + + return 0; +} + +static bool device_is_bound_by_mounts(Device *d, sd_device *dev) { + const char *bound_by; + int r; + + assert(d); + assert(dev); + + if (sd_device_get_property_value(dev, "SYSTEMD_MOUNT_DEVICE_BOUND", &bound_by) >= 0) { + r = parse_boolean(bound_by); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_MOUNT_DEVICE_BOUND='%s' udev property, ignoring: %m", bound_by); + + d->bind_mounts = r > 0; + } else + d->bind_mounts = false; + + return d->bind_mounts; +} + +static void device_upgrade_mount_deps(Unit *u) { + Unit *other; + Iterator i; + void *v; + int r; + + /* Let's upgrade Requires= to BindsTo= on us. (Used when SYSTEMD_MOUNT_DEVICE_BOUND is set) */ + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_REQUIRED_BY], i) { + if (other->type != UNIT_MOUNT) + continue; + + r = unit_add_dependency(other, UNIT_BINDS_TO, u, true, UNIT_DEPENDENCY_UDEV); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to add BindsTo= dependency between device and mount unit, ignoring: %m"); + } +} + +static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool main) { + _cleanup_free_ char *e = NULL; + const char *sysfs = NULL; + Unit *u = NULL; + bool delete; + int r; + + assert(m); + assert(path); + + if (dev) { + r = sd_device_get_syspath(dev, &sysfs); + if (r < 0) { + log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m"); + return 0; + } + } + + r = unit_name_from_path(path, ".device", &e); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to generate unit name from device path: %m"); + + u = manager_get_unit(m, e); + if (u) { + /* The device unit can still be present even if the device was unplugged: a mount unit can reference it + * hence preventing the GC to have garbaged it. That's desired since the device unit may have a + * dependency on the mount unit which was added during the loading of the later. When the device is + * plugged the sysfs might not be initialized yet, as we serialize the device's state but do not + * serialize the sysfs path across reloads/reexecs. Hence, when coming back from a reload/restart we + * might have the state valid, but not the sysfs path. Hence, let's filter out conflicting devices, but + * let's accept devices in any state with no sysfs path set. */ + + if (DEVICE(u)->state == DEVICE_PLUGGED && + DEVICE(u)->sysfs && + sysfs && + !path_equal(DEVICE(u)->sysfs, sysfs)) { + log_unit_debug(u, "Device %s appeared twice with different sysfs paths %s and %s, ignoring the latter.", + e, DEVICE(u)->sysfs, sysfs); + return -EEXIST; + } + + delete = false; + + /* Let's remove all dependencies generated due to udev properties. We'll readd whatever is configured + * now below. */ + unit_remove_dependencies(u, UNIT_DEPENDENCY_UDEV); + } else { + delete = true; + + r = unit_new_for_name(m, sizeof(Device), e, &u); + if (r < 0) { + log_device_error_errno(dev, r, "Failed to allocate device unit %s: %m", e); + goto fail; + } + + unit_add_to_load_queue(u); + } + + /* If this was created via some dependency and has not actually been seen yet ->sysfs will not be + * initialized. Hence initialize it if necessary. */ + if (sysfs) { + r = device_set_sysfs(DEVICE(u), sysfs); + if (r < 0) { + log_unit_error_errno(u, r, "Failed to set sysfs path %s: %m", sysfs); + goto fail; + } + + /* The additional systemd udev properties we only interpret for the main object */ + if (main) + (void) device_add_udev_wants(u, dev); + } + + (void) device_update_description(u, dev, path); + + /* So the user wants the mount units to be bound to the device but a mount unit might has been seen by systemd + * before the device appears on its radar. In this case the device unit is partially initialized and includes + * the deps on the mount unit but at that time the "bind mounts" flag wasn't not present. Fix this up now. */ + if (dev && device_is_bound_by_mounts(DEVICE(u), dev)) + device_upgrade_mount_deps(u); + + /* Note that this won't dispatch the load queue, the caller has to do that if needed and appropriate */ + unit_add_to_dbus_queue(u); + + return 0; + +fail: + if (delete) + unit_free(u); + + return r; +} + +static int device_process_new(Manager *m, sd_device *dev) { + const char *sysfs, *dn, *alias; + dev_t devnum; + int r; + + assert(m); + + if (sd_device_get_syspath(dev, &sysfs) < 0) + return 0; + + /* Add the main unit named after the sysfs path */ + r = device_setup_unit(m, dev, sysfs, true); + if (r < 0) + return r; + + /* Add an additional unit for the device node */ + if (sd_device_get_devname(dev, &dn) >= 0) + (void) device_setup_unit(m, dev, dn, false); + + /* Add additional units for all symlinks */ + if (sd_device_get_devnum(dev, &devnum) >= 0) { + const char *p; + + FOREACH_DEVICE_DEVLINK(dev, p) { + struct stat st; + + if (PATH_STARTSWITH_SET(p, "/dev/block/", "/dev/char/")) + continue; + + /* Verify that the symlink in the FS actually belongs + * to this device. This is useful to deal with + * conflicting devices, e.g. when two disks want the + * same /dev/disk/by-label/xxx link because they have + * the same label. We want to make sure that the same + * device that won the symlink wins in systemd, so we + * check the device node major/minor */ + if (stat(p, &st) >= 0 && + ((!S_ISBLK(st.st_mode) && !S_ISCHR(st.st_mode)) || + st.st_rdev != devnum)) + continue; + + (void) device_setup_unit(m, dev, p, false); + } + } + + /* Add additional units for all explicitly configured aliases */ + if (sd_device_get_property_value(dev, "SYSTEMD_ALIAS", &alias) < 0) + return 0; + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&alias, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_ALIAS property: %m"); + + if (!path_is_absolute(word)) + log_device_warning(dev, "SYSTEMD_ALIAS is not an absolute path, ignoring: %s", word); + else if (!path_is_normalized(word)) + log_device_warning(dev, "SYSTEMD_ALIAS is not a normalized path, ignoring: %s", word); + else + (void) device_setup_unit(m, dev, word, false); + } + + return 0; +} + +static void device_found_changed(Device *d, DeviceFound previous, DeviceFound now) { + assert(d); + + /* Didn't exist before, but does now? if so, generate a new invocation ID for it */ + if (previous == DEVICE_NOT_FOUND && now != DEVICE_NOT_FOUND) + (void) unit_acquire_invocation_id(UNIT(d)); + + if (FLAGS_SET(now, DEVICE_FOUND_UDEV)) + /* When the device is known to udev we consider it plugged. */ + device_set_state(d, DEVICE_PLUGGED); + else if (now != DEVICE_NOT_FOUND && !FLAGS_SET(previous, DEVICE_FOUND_UDEV)) + /* If the device has not been seen by udev yet, but is now referenced by the kernel, then we assume the + * kernel knows it now, and udev might soon too. */ + device_set_state(d, DEVICE_TENTATIVE); + else + /* If nobody sees the device, or if the device was previously seen by udev and now is only referenced + * from the kernel, then we consider the device is gone, the kernel just hasn't noticed it yet. */ + device_set_state(d, DEVICE_DEAD); +} + +static void device_update_found_one(Device *d, DeviceFound found, DeviceFound mask) { + assert(d); + + if (MANAGER_IS_RUNNING(UNIT(d)->manager)) { + DeviceFound n, previous; + + /* When we are already running, then apply the new mask right-away, and trigger state changes + * right-away */ + + n = (d->found & ~mask) | (found & mask); + if (n == d->found) + return; + + previous = d->found; + d->found = n; + + device_found_changed(d, previous, n); + } else + /* We aren't running yet, let's apply the new mask to the shadow variable instead, which we'll apply as + * soon as we catch-up with the state. */ + d->enumerated_found = (d->enumerated_found & ~mask) | (found & mask); +} + +static void device_update_found_by_sysfs(Manager *m, const char *sysfs, DeviceFound found, DeviceFound mask) { + Device *d, *l, *n; + + assert(m); + assert(sysfs); + + if (mask == 0) + return; + + l = hashmap_get(m->devices_by_sysfs, sysfs); + LIST_FOREACH_SAFE(same_sysfs, d, n, l) + device_update_found_one(d, found, mask); +} + +static int device_update_found_by_name(Manager *m, const char *path, DeviceFound found, DeviceFound mask) { + _cleanup_free_ char *e = NULL; + Unit *u; + int r; + + assert(m); + assert(path); + + if (mask == 0) + return 0; + + r = unit_name_from_path(path, ".device", &e); + if (r < 0) + return log_error_errno(r, "Failed to generate unit name from device path: %m"); + + u = manager_get_unit(m, e); + if (!u) + return 0; + + device_update_found_one(DEVICE(u), found, mask); + return 0; +} + +static bool device_is_ready(sd_device *dev) { + const char *ready; + + assert(dev); + + if (sd_device_get_property_value(dev, "SYSTEMD_READY", &ready) < 0) + return true; + + return parse_boolean(ready) != 0; +} + +static Unit *device_following(Unit *u) { + Device *d = DEVICE(u); + Device *other, *first = NULL; + + assert(d); + + if (startswith(u->id, "sys-")) + return NULL; + + /* Make everybody follow the unit that's named after the sysfs path */ + LIST_FOREACH_AFTER(same_sysfs, other, d) + if (startswith(UNIT(other)->id, "sys-")) + return UNIT(other); + + LIST_FOREACH_BEFORE(same_sysfs, other, d) { + if (startswith(UNIT(other)->id, "sys-")) + return UNIT(other); + + first = other; + } + + return UNIT(first); +} + +static int device_following_set(Unit *u, Set **_set) { + Device *d = DEVICE(u), *other; + _cleanup_set_free_ Set *set = NULL; + int r; + + assert(d); + assert(_set); + + if (LIST_JUST_US(same_sysfs, d)) { + *_set = NULL; + return 0; + } + + set = set_new(NULL); + if (!set) + return -ENOMEM; + + LIST_FOREACH_AFTER(same_sysfs, other, d) { + r = set_put(set, other); + if (r < 0) + return r; + } + + LIST_FOREACH_BEFORE(same_sysfs, other, d) { + r = set_put(set, other); + if (r < 0) + return r; + } + + *_set = TAKE_PTR(set); + return 1; +} + +static void device_shutdown(Manager *m) { + assert(m); + + m->device_monitor = sd_device_monitor_unref(m->device_monitor); + m->devices_by_sysfs = hashmap_free(m->devices_by_sysfs); +} + +static void device_enumerate(Manager *m) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + sd_device *dev; + int r; + + assert(m); + + if (!m->device_monitor) { + r = sd_device_monitor_new(&m->device_monitor); + if (r < 0) { + log_error_errno(r, "Failed to allocate device monitor: %m"); + goto fail; + } + + /* This will fail if we are unprivileged, but that + * should not matter much, as user instances won't run + * during boot. */ + (void) sd_device_monitor_set_receive_buffer_size(m->device_monitor, 128*1024*1024); + + r = sd_device_monitor_filter_add_match_tag(m->device_monitor, "systemd"); + if (r < 0) { + log_error_errno(r, "Failed to add udev tag match: %m"); + goto fail; + } + + r = sd_device_monitor_attach_event(m->device_monitor, m->event); + if (r < 0) { + log_error_errno(r, "Failed to attach event to device monitor: %m"); + goto fail; + } + + r = sd_device_monitor_start(m->device_monitor, device_dispatch_io, m); + if (r < 0) { + log_error_errno(r, "Failed to start device monitor: %m"); + goto fail; + } + } + + r = sd_device_enumerator_new(&e); + if (r < 0) { + log_error_errno(r, "Failed to allocate device enumerator: %m"); + goto fail; + } + + r = sd_device_enumerator_add_match_tag(e, "systemd"); + if (r < 0) { + log_error_errno(r, "Failed to set tag for device enumeration: %m"); + goto fail; + } + + FOREACH_DEVICE(e, dev) { + const char *sysfs; + + if (!device_is_ready(dev)) + continue; + + (void) device_process_new(m, dev); + + if (sd_device_get_syspath(dev, &sysfs) < 0) + continue; + + device_update_found_by_sysfs(m, sysfs, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV); + } + + return; + +fail: + device_shutdown(m); +} + +static void device_propagate_reload_by_sysfs(Manager *m, const char *sysfs) { + Device *d, *l, *n; + int r; + + assert(m); + assert(sysfs); + + l = hashmap_get(m->devices_by_sysfs, sysfs); + LIST_FOREACH_SAFE(same_sysfs, d, n, l) { + if (d->state == DEVICE_DEAD) + continue; + + r = manager_propagate_reload(m, UNIT(d), JOB_REPLACE, NULL); + if (r < 0) + log_warning_errno(r, "Failed to propagate reload, ignoring: %m"); + } +} + +static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata) { + Manager *m = userdata; + const char *action, *sysfs; + int r; + + assert(m); + assert(dev); + + r = sd_device_get_syspath(dev, &sysfs); + if (r < 0) { + log_device_error_errno(dev, r, "Failed to get device sys path: %m"); + return 0; + } + + r = sd_device_get_property_value(dev, "ACTION", &action); + if (r < 0) { + log_device_error_errno(dev, r, "Failed to get udev action string: %m"); + return 0; + } + + if (streq(action, "change")) + device_propagate_reload_by_sysfs(m, sysfs); + + /* A change event can signal that a device is becoming ready, in particular if + * the device is using the SYSTEMD_READY logic in udev + * so we need to reach the else block of the follwing if, even for change events */ + if (streq(action, "remove")) { + r = swap_process_device_remove(m, dev); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to process swap device remove event, ignoring: %m"); + + /* If we get notified that a device was removed by + * udev, then it's completely gone, hence unset all + * found bits */ + device_update_found_by_sysfs(m, sysfs, 0, DEVICE_FOUND_UDEV|DEVICE_FOUND_MOUNT|DEVICE_FOUND_SWAP); + + } else if (device_is_ready(dev)) { + + (void) device_process_new(m, dev); + + r = swap_process_device_new(m, dev); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to process swap device new event, ignoring: %m"); + + manager_dispatch_load_queue(m); + + /* The device is found now, set the udev found bit */ + device_update_found_by_sysfs(m, sysfs, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV); + + } else { + /* The device is nominally around, but not ready for + * us. Hence unset the udev bit, but leave the rest + * around. */ + + device_update_found_by_sysfs(m, sysfs, 0, DEVICE_FOUND_UDEV); + } + + return 0; +} + +static bool device_supported(void) { + static int read_only = -1; + + /* If /sys is read-only we don't support device units, and any + * attempts to start one should fail immediately. */ + + if (read_only < 0) + read_only = path_is_read_only_fs("/sys"); + + return read_only <= 0; +} + +static int validate_node(Manager *m, const char *node, sd_device **ret) { + struct stat st; + int r; + + assert(m); + assert(node); + assert(ret); + + /* Validates a device node that showed up in /proc/swaps or /proc/self/mountinfo if it makes sense for us to + * track. Note that this validator is fine within missing device nodes, but not with badly set up ones! */ + + if (!path_startswith(node, "/dev")) { + *ret = NULL; + return 0; /* bad! */ + } + + if (stat(node, &st) < 0) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to stat() device node file %s: %m", node); + + *ret = NULL; + return 1; /* good! (though missing) */ + + } else { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + r = device_new_from_stat_rdev(&dev, &st); + if (r == -ENOENT) { + *ret = NULL; + return 1; /* good! (though missing) */ + } else if (r == -ENOTTY) { + *ret = NULL; + return 0; /* bad! (not a device node but some other kind of file system node) */ + } else if (r < 0) + return log_error_errno(r, "Failed to get udev device from devnum %u:%u: %m", major(st.st_rdev), minor(st.st_rdev)); + + *ret = TAKE_PTR(dev); + return 1; /* good! */ + } +} + +void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask) { + int r; + + assert(m); + assert(node); + + if (!device_supported()) + return; + + if (mask == 0) + return; + + /* This is called whenever we find a device referenced in /proc/swaps or /proc/self/mounts. Such a device might + * be mounted/enabled at a time where udev has not finished probing it yet, and we thus haven't learned about + * it yet. In this case we will set the device unit to "tentative" state. + * + * This takes a pair of DeviceFound flags parameters. The 'mask' parameter is a bit mask that indicates which + * bits of 'found' to copy into the per-device DeviceFound flags field. Thus, this function may be used to set + * and unset individual bits in a single call, while merging partially with previous state. */ + + if ((found & mask) != 0) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + /* If the device is known in the kernel and newly appeared, then we'll create a device unit for it, + * under the name referenced in /proc/swaps or /proc/self/mountinfo. But first, let's validate if + * everything is alright with the device node. */ + + r = validate_node(m, node, &dev); + if (r <= 0) + return; /* Don't create a device unit for this if the device node is borked. */ + + (void) device_setup_unit(m, dev, node, false); + } + + /* Update the device unit's state, should it exist */ + (void) device_update_found_by_name(m, node, found, mask); +} + +bool device_shall_be_bound_by(Unit *device, Unit *u) { + assert(device); + assert(u); + + if (u->type != UNIT_MOUNT) + return false; + + return DEVICE(device)->bind_mounts; +} + +const UnitVTable device_vtable = { + .object_size = sizeof(Device), + .sections = + "Unit\0" + "Device\0" + "Install\0", + + .gc_jobs = true, + + .init = device_init, + .done = device_done, + .load = device_load, + + .coldplug = device_coldplug, + .catchup = device_catchup, + + .serialize = device_serialize, + .deserialize_item = device_deserialize_item, + + .dump = device_dump, + + .active_state = device_active_state, + .sub_state_to_string = device_sub_state_to_string, + + .bus_vtable = bus_device_vtable, + + .following = device_following, + .following_set = device_following_set, + + .enumerate = device_enumerate, + .shutdown = device_shutdown, + .supported = device_supported, + + .status_message_formats = { + .starting_stopping = { + [0] = "Expecting device %s...", + }, + .finished_start_job = { + [JOB_DONE] = "Found device %s.", + [JOB_TIMEOUT] = "Timed out waiting for device %s.", + }, + }, +}; diff --git a/src/core/device.h b/src/core/device.h new file mode 100644 index 0000000..3062be7 --- /dev/null +++ b/src/core/device.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "unit.h" + +typedef struct Device Device; + +/* A mask specifying where we have seen the device currently. This is a bitmask because the device might show up + * asynchronously from each other at various places. For example, in very common case a device might already be mounted + * before udev finished probing it (think: a script setting up a loopback block device, formatting it and mounting it + * in quick succession). Hence we need to track precisely where it is already visible and where not. */ +typedef enum DeviceFound { + DEVICE_NOT_FOUND = 0, + DEVICE_FOUND_UDEV = 1 << 0, /* The device has shown up in the udev database */ + DEVICE_FOUND_MOUNT = 1 << 1, /* The device has shown up in /proc/self/mountinfo */ + DEVICE_FOUND_SWAP = 1 << 2, /* The device has shown up in /proc/swaps */ + DEVICE_FOUND_MASK = DEVICE_FOUND_UDEV|DEVICE_FOUND_MOUNT|DEVICE_FOUND_SWAP, +} DeviceFound; + +struct Device { + Unit meta; + + char *sysfs; + + /* In order to be able to distinguish dependencies on different device nodes we might end up creating multiple + * devices for the same sysfs path. We chain them up here. */ + LIST_FIELDS(struct Device, same_sysfs); + + DeviceState state, deserialized_state; + DeviceFound found, deserialized_found, enumerated_found; + + bool bind_mounts; + + /* The SYSTEMD_WANTS udev property for this device the last time we saw it */ + char **wants_property; +}; + +extern const UnitVTable device_vtable; + +void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask); +bool device_shall_be_bound_by(Unit *device, Unit *u); + +DEFINE_CAST(DEVICE, Device); diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c new file mode 100644 index 0000000..530df70 --- /dev/null +++ b/src/core/dynamic-user.c @@ -0,0 +1,824 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <grp.h> +#include <pwd.h> +#include <sys/file.h> + +#include "clean-ipc.h" +#include "dynamic-user.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "nscd-flush.h" +#include "parse-util.h" +#include "random-util.h" +#include "serialize.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "user-util.h" + +/* Takes a value generated randomly or by hashing and turns it into a UID in the right range */ +#define UID_CLAMP_INTO_RANGE(rnd) (((uid_t) (rnd) % (DYNAMIC_UID_MAX - DYNAMIC_UID_MIN + 1)) + DYNAMIC_UID_MIN) + +DEFINE_PRIVATE_TRIVIAL_REF_FUNC(DynamicUser, dynamic_user); + +static DynamicUser* dynamic_user_free(DynamicUser *d) { + if (!d) + return NULL; + + if (d->manager) + (void) hashmap_remove(d->manager->dynamic_users, d->name); + + safe_close_pair(d->storage_socket); + return mfree(d); +} + +static int dynamic_user_add(Manager *m, const char *name, int storage_socket[static 2], DynamicUser **ret) { + DynamicUser *d; + int r; + + assert(m); + assert(name); + assert(storage_socket); + + r = hashmap_ensure_allocated(&m->dynamic_users, &string_hash_ops); + if (r < 0) + return r; + + d = malloc0(offsetof(DynamicUser, name) + strlen(name) + 1); + if (!d) + return -ENOMEM; + + strcpy(d->name, name); + + d->storage_socket[0] = storage_socket[0]; + d->storage_socket[1] = storage_socket[1]; + + r = hashmap_put(m->dynamic_users, d->name, d); + if (r < 0) { + free(d); + return r; + } + + d->manager = m; + + if (ret) + *ret = d; + + return 0; +} + +static int dynamic_user_acquire(Manager *m, const char *name, DynamicUser** ret) { + _cleanup_close_pair_ int storage_socket[2] = { -1, -1 }; + DynamicUser *d; + int r; + + assert(m); + assert(name); + + /* Return the DynamicUser structure for a specific user name. Note that this won't actually allocate a UID for + * it, but just prepare the data structure for it. The UID is allocated only on demand, when it's really + * needed, and in the child process we fork off, since allocation involves NSS checks which are not OK to do + * from PID 1. To allow the children and PID 1 share information about allocated UIDs we use an anonymous + * AF_UNIX/SOCK_DGRAM socket (called the "storage socket") that contains at most one datagram with the + * allocated UID number, plus an fd referencing the lock file for the UID + * (i.e. /run/systemd/dynamic-uid/$UID). Why involve the socket pair? So that PID 1 and all its children can + * share the same storage for the UID and lock fd, simply by inheriting the storage socket fds. The socket pair + * may exist in three different states: + * + * a) no datagram stored. This is the initial state. In this case the dynamic user was never realized. + * + * b) a datagram containing a UID stored, but no lock fd attached to it. In this case there was already a + * statically assigned UID by the same name, which we are reusing. + * + * c) a datagram containing a UID stored, and a lock fd is attached to it. In this case we allocated a dynamic + * UID and locked it in the file system, using the lock fd. + * + * As PID 1 and various children might access the socket pair simultaneously, and pop the datagram or push it + * back in any time, we also maintain a lock on the socket pair. Note one peculiarity regarding locking here: + * the UID lock on disk is protected via a BSD file lock (i.e. an fd-bound lock), so that the lock is kept in + * place as long as there's a reference to the fd open. The lock on the storage socket pair however is a POSIX + * file lock (i.e. a process-bound lock), as all users share the same fd of this (after all it is anonymous, + * nobody else could get any access to it except via our own fd) and we want to synchronize access between all + * processes that have access to it. */ + + d = hashmap_get(m->dynamic_users, name); + if (d) { + if (ret) { + /* We already have a structure for the dynamic user, let's increase the ref count and reuse it */ + d->n_ref++; + *ret = d; + } + return 0; + } + + if (!valid_user_group_name_or_id(name)) + return -EINVAL; + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, storage_socket) < 0) + return -errno; + + r = dynamic_user_add(m, name, storage_socket, &d); + if (r < 0) + return r; + + storage_socket[0] = storage_socket[1] = -1; + + if (ret) { + d->n_ref++; + *ret = d; + } + + return 1; +} + +static int make_uid_symlinks(uid_t uid, const char *name, bool b) { + + char path1[STRLEN("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1]; + const char *path2; + int r = 0, k; + + /* Add direct additional symlinks for direct lookups of dynamic UIDs and their names by userspace code. The + * only reason we have this is because dbus-daemon cannot use D-Bus for resolving users and groups (since it + * would be its own client then). We hence keep these world-readable symlinks in place, so that the + * unprivileged dbus user can read the mappings when it needs them via these symlinks instead of having to go + * via the bus. Ideally, we'd use the lock files we keep for this anyway, but we can't since we use BSD locks + * on them and as those may be taken by any user with read access we can't make them world-readable. */ + + xsprintf(path1, "/run/systemd/dynamic-uid/direct:" UID_FMT, uid); + if (unlink(path1) < 0 && errno != ENOENT) + r = -errno; + + if (b && symlink(name, path1) < 0) { + k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path1); + if (r == 0) + r = k; + } + + path2 = strjoina("/run/systemd/dynamic-uid/direct:", name); + if (unlink(path2) < 0 && errno != ENOENT) { + k = -errno; + if (r == 0) + r = k; + } + + if (b && symlink(path1 + STRLEN("/run/systemd/dynamic-uid/direct:"), path2) < 0) { + k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path2); + if (r == 0) + r = k; + } + + return r; +} + +static int pick_uid(char **suggested_paths, const char *name, uid_t *ret_uid) { + + /* Find a suitable free UID. We use the following strategy to find a suitable UID: + * + * 1. Initially, we try to read the UID of a number of specified paths. If any of these UIDs works, we use + * them. We use in order to increase the chance of UID reuse, if StateDirectory=, CacheDirectory= or + * LogsDirectory= are used, as reusing the UID these directories are owned by saves us from having to + * recursively chown() them to new users. + * + * 2. If that didn't yield a currently unused UID, we hash the user name, and try to use that. This should be + * pretty good, as the use ris by default derived from the unit name, and hence the same service and same + * user should usually get the same UID as long as our hashing doesn't clash. + * + * 3. Finally, if that didn't work, we randomly pick UIDs, until we find one that is empty. + * + * Since the dynamic UID space is relatively small we'll stop trying after 100 iterations, giving up. */ + + enum { + PHASE_SUGGESTED, /* the first phase, reusing directory ownership UIDs */ + PHASE_HASHED, /* the second phase, deriving a UID from the username by hashing */ + PHASE_RANDOM, /* the last phase, randomly picking UIDs */ + } phase = PHASE_SUGGESTED; + + static const uint8_t hash_key[] = { + 0x37, 0x53, 0x7e, 0x31, 0xcf, 0xce, 0x48, 0xf5, + 0x8a, 0xbb, 0x39, 0x57, 0x8d, 0xd9, 0xec, 0x59 + }; + + unsigned n_tries = 100, current_suggested = 0; + int r; + + (void) mkdir("/run/systemd/dynamic-uid", 0755); + + for (;;) { + char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_close_ int lock_fd = -1; + uid_t candidate; + ssize_t l; + + if (--n_tries <= 0) /* Give up retrying eventually */ + return -EBUSY; + + switch (phase) { + + case PHASE_SUGGESTED: { + struct stat st; + + if (!suggested_paths || !suggested_paths[current_suggested]) { + /* We reached the end of the suggested paths list, let's try by hashing the name */ + phase = PHASE_HASHED; + continue; + } + + if (stat(suggested_paths[current_suggested++], &st) < 0) + continue; /* We can't read the UID of this path, but that doesn't matter, just try the next */ + + candidate = st.st_uid; + break; + } + + case PHASE_HASHED: + /* A static user by this name does not exist yet. Let's find a free ID then, and use that. We + * start with a UID generated as hash from the user name. */ + candidate = UID_CLAMP_INTO_RANGE(siphash24(name, strlen(name), hash_key)); + + /* If this one fails, we should proceed with random tries */ + phase = PHASE_RANDOM; + break; + + case PHASE_RANDOM: + + /* Pick another random UID, and see if that works for us. */ + random_bytes(&candidate, sizeof(candidate)); + candidate = UID_CLAMP_INTO_RANGE(candidate); + break; + + default: + assert_not_reached("unknown phase"); + } + + /* Make sure whatever we picked here actually is in the right range */ + if (!uid_is_dynamic(candidate)) + continue; + + xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, candidate); + + for (;;) { + struct stat st; + + lock_fd = open(lock_path, O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, 0600); + if (lock_fd < 0) + return -errno; + + r = flock(lock_fd, LOCK_EX|LOCK_NB); /* Try to get a BSD file lock on the UID lock file */ + if (r < 0) { + if (IN_SET(errno, EBUSY, EAGAIN)) + goto next; /* already in use */ + + return -errno; + } + + if (fstat(lock_fd, &st) < 0) + return -errno; + if (st.st_nlink > 0) + break; + + /* Oh, bummer, we got the lock, but the file was unlinked between the time we opened it and + * got the lock. Close it, and try again. */ + lock_fd = safe_close(lock_fd); + } + + /* Some superficial check whether this UID/GID might already be taken by some static user */ + if (getpwuid(candidate) || + getgrgid((gid_t) candidate) || + search_ipc(candidate, (gid_t) candidate) != 0) { + (void) unlink(lock_path); + continue; + } + + /* Let's store the user name in the lock file, so that we can use it for looking up the username for a UID */ + l = pwritev(lock_fd, + (struct iovec[2]) { + IOVEC_INIT_STRING(name), + IOVEC_INIT((char[1]) { '\n' }, 1), + }, 2, 0); + if (l < 0) { + r = -errno; + (void) unlink(lock_path); + return r; + } + + (void) ftruncate(lock_fd, l); + (void) make_uid_symlinks(candidate, name, true); /* also add direct lookup symlinks */ + + *ret_uid = candidate; + return TAKE_FD(lock_fd); + + next: + ; + } +} + +static int dynamic_user_pop(DynamicUser *d, uid_t *ret_uid, int *ret_lock_fd) { + uid_t uid = UID_INVALID; + struct iovec iov = IOVEC_INIT(&uid, sizeof(uid)); + int lock_fd; + ssize_t k; + + assert(d); + assert(ret_uid); + assert(ret_lock_fd); + + /* Read the UID and lock fd that is stored in the storage AF_UNIX socket. This should be called with the lock + * on the socket taken. */ + + k = receive_one_fd_iov(d->storage_socket[0], &iov, 1, MSG_DONTWAIT, &lock_fd); + if (k < 0) + return (int) k; + + *ret_uid = uid; + *ret_lock_fd = lock_fd; + + return 0; +} + +static int dynamic_user_push(DynamicUser *d, uid_t uid, int lock_fd) { + struct iovec iov = IOVEC_INIT(&uid, sizeof(uid)); + + assert(d); + + /* Store the UID and lock_fd in the storage socket. This should be called with the socket pair lock taken. */ + return send_one_fd_iov(d->storage_socket[1], lock_fd, &iov, 1, MSG_DONTWAIT); +} + +static void unlink_uid_lock(int lock_fd, uid_t uid, const char *name) { + char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + + if (lock_fd < 0) + return; + + xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid); + (void) unlink(lock_path); + + (void) make_uid_symlinks(uid, name, false); /* remove direct lookup symlinks */ +} + +static int lockfp(int fd, int *fd_lock) { + if (lockf(fd, F_LOCK, 0) < 0) + return -errno; + *fd_lock = fd; + return 0; +} + +static void unlockfp(int *fd_lock) { + if (*fd_lock < 0) + return; + lockf(*fd_lock, F_ULOCK, 0); + *fd_lock = -1; +} + +static int dynamic_user_realize( + DynamicUser *d, + char **suggested_dirs, + uid_t *ret_uid, gid_t *ret_gid, + bool is_user) { + + _cleanup_(unlockfp) int storage_socket0_lock = -1; + _cleanup_close_ int uid_lock_fd = -1; + _cleanup_close_ int etc_passwd_lock_fd = -1; + uid_t num = UID_INVALID; /* a uid if is_user, and a gid otherwise */ + gid_t gid = GID_INVALID; /* a gid if is_user, ignored otherwise */ + bool flush_cache = false; + int r; + + assert(d); + assert(is_user == !!ret_uid); + assert(ret_gid); + + /* Acquire a UID for the user name. This will allocate a UID for the user name if the user doesn't exist + * yet. If it already exists its existing UID/GID will be reused. */ + + r = lockfp(d->storage_socket[0], &storage_socket0_lock); + if (r < 0) + return r; + + r = dynamic_user_pop(d, &num, &uid_lock_fd); + if (r < 0) { + int new_uid_lock_fd; + uid_t new_uid; + + if (r != -EAGAIN) + return r; + + /* OK, nothing stored yet, let's try to find something useful. While we are working on this release the + * lock however, so that nobody else blocks on our NSS lookups. */ + unlockfp(&storage_socket0_lock); + + /* Let's see if a proper, static user or group by this name exists. Try to take the lock on + * /etc/passwd, if that fails with EROFS then /etc is read-only. In that case it's fine if we don't + * take the lock, given that users can't be added there anyway in this case. */ + etc_passwd_lock_fd = take_etc_passwd_lock(NULL); + if (etc_passwd_lock_fd < 0 && etc_passwd_lock_fd != -EROFS) + return etc_passwd_lock_fd; + + /* First, let's parse this as numeric UID */ + r = parse_uid(d->name, &num); + if (r < 0) { + struct passwd *p; + struct group *g; + + if (is_user) { + /* OK, this is not a numeric UID. Let's see if there's a user by this name */ + p = getpwnam(d->name); + if (p) { + num = p->pw_uid; + gid = p->pw_gid; + } else { + /* if the user does not exist but the group with the same name exists, refuse operation */ + g = getgrnam(d->name); + if (g) + return -EILSEQ; + } + } else { + /* Let's see if there's a group by this name */ + g = getgrnam(d->name); + if (g) + num = (uid_t) g->gr_gid; + else { + /* if the group does not exist but the user with the same name exists, refuse operation */ + p = getpwnam(d->name); + if (p) + return -EILSEQ; + } + } + } + + if (num == UID_INVALID) { + /* No static UID assigned yet, excellent. Let's pick a new dynamic one, and lock it. */ + + uid_lock_fd = pick_uid(suggested_dirs, d->name, &num); + if (uid_lock_fd < 0) + return uid_lock_fd; + } + + /* So, we found a working UID/lock combination. Let's see if we actually still need it. */ + r = lockfp(d->storage_socket[0], &storage_socket0_lock); + if (r < 0) { + unlink_uid_lock(uid_lock_fd, num, d->name); + return r; + } + + r = dynamic_user_pop(d, &new_uid, &new_uid_lock_fd); + if (r < 0) { + if (r != -EAGAIN) { + /* OK, something bad happened, let's get rid of the bits we acquired. */ + unlink_uid_lock(uid_lock_fd, num, d->name); + return r; + } + + /* Great! Nothing is stored here, still. Store our newly acquired data. */ + flush_cache = true; + } else { + /* Hmm, so as it appears there's now something stored in the storage socket. Throw away what we + * acquired, and use what's stored now. */ + + unlink_uid_lock(uid_lock_fd, num, d->name); + safe_close(uid_lock_fd); + + num = new_uid; + uid_lock_fd = new_uid_lock_fd; + } + } else if (is_user && !uid_is_dynamic(num)) { + struct passwd *p; + + /* Statically allocated user may have different uid and gid. So, let's obtain the gid. */ + errno = 0; + p = getpwuid(num); + if (!p) + return errno > 0 ? -errno : -ESRCH; + + gid = p->pw_gid; + } + + /* If the UID/GID was already allocated dynamically, push the data we popped out back in. If it was already + * allocated statically, push the UID back too, but do not push the lock fd in. If we allocated the UID + * dynamically right here, push that in along with the lock fd for it. */ + r = dynamic_user_push(d, num, uid_lock_fd); + if (r < 0) + return r; + + if (flush_cache) { + /* If we allocated a new dynamic UID, refresh nscd, so that it forgets about potentially cached + * negative entries. But let's do so after we release the /etc/passwd lock, so that there's no + * potential for nscd wanting to lock that for completing the invalidation. */ + etc_passwd_lock_fd = safe_close(etc_passwd_lock_fd); + (void) nscd_flush_cache(STRV_MAKE("passwd", "group")); + } + + if (is_user) { + *ret_uid = num; + *ret_gid = gid != GID_INVALID ? gid : num; + } else + *ret_gid = num; + + return 0; +} + +int dynamic_user_current(DynamicUser *d, uid_t *ret) { + _cleanup_(unlockfp) int storage_socket0_lock = -1; + _cleanup_close_ int lock_fd = -1; + uid_t uid; + int r; + + assert(d); + assert(ret); + + /* Get the currently assigned UID for the user, if there's any. This simply pops the data from the storage socket, and pushes it back in right-away. */ + + r = lockfp(d->storage_socket[0], &storage_socket0_lock); + if (r < 0) + return r; + + r = dynamic_user_pop(d, &uid, &lock_fd); + if (r < 0) + return r; + + r = dynamic_user_push(d, uid, lock_fd); + if (r < 0) + return r; + + *ret = uid; + return 0; +} + +static DynamicUser* dynamic_user_unref(DynamicUser *d) { + if (!d) + return NULL; + + /* Note that this doesn't actually release any resources itself. If a dynamic user should be fully destroyed + * and its UID released, use dynamic_user_destroy() instead. NB: the dynamic user table may contain entries + * with no references, which is commonly the case right before a daemon reload. */ + + assert(d->n_ref > 0); + d->n_ref--; + + return NULL; +} + +static int dynamic_user_close(DynamicUser *d) { + _cleanup_(unlockfp) int storage_socket0_lock = -1; + _cleanup_close_ int lock_fd = -1; + uid_t uid; + int r; + + /* Release the user ID, by releasing the lock on it, and emptying the storage socket. After this the user is + * unrealized again, much like it was after it the DynamicUser object was first allocated. */ + + r = lockfp(d->storage_socket[0], &storage_socket0_lock); + if (r < 0) + return r; + + r = dynamic_user_pop(d, &uid, &lock_fd); + if (r == -EAGAIN) + /* User wasn't realized yet, nothing to do. */ + return 0; + if (r < 0) + return r; + + /* This dynamic user was realized and dynamically allocated. In this case, let's remove the lock file. */ + unlink_uid_lock(lock_fd, uid, d->name); + + (void) nscd_flush_cache(STRV_MAKE("passwd", "group")); + return 1; +} + +static DynamicUser* dynamic_user_destroy(DynamicUser *d) { + if (!d) + return NULL; + + /* Drop a reference to a DynamicUser object, and destroy the user completely if this was the last + * reference. This is called whenever a service is shut down and wants its dynamic UID gone. Note that + * dynamic_user_unref() is what is called whenever a service is simply freed, for example during a reload + * cycle, where the dynamic users should not be destroyed, but our datastructures should. */ + + dynamic_user_unref(d); + + if (d->n_ref > 0) + return NULL; + + (void) dynamic_user_close(d); + return dynamic_user_free(d); +} + +int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds) { + DynamicUser *d; + Iterator i; + + assert(m); + assert(f); + assert(fds); + + /* Dump the dynamic user database into the manager serialization, to deal with daemon reloads. */ + + HASHMAP_FOREACH(d, m->dynamic_users, i) { + int copy0, copy1; + + copy0 = fdset_put_dup(fds, d->storage_socket[0]); + if (copy0 < 0) + return log_error_errno(copy0, "Failed to add dynamic user storage fd to serialization: %m"); + + copy1 = fdset_put_dup(fds, d->storage_socket[1]); + if (copy1 < 0) + return log_error_errno(copy1, "Failed to add dynamic user storage fd to serialization: %m"); + + (void) serialize_item_format(f, "dynamic-user", "%s %i %i", d->name, copy0, copy1); + } + + return 0; +} + +void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds) { + _cleanup_free_ char *name = NULL, *s0 = NULL, *s1 = NULL; + int r, fd0, fd1; + + assert(m); + assert(value); + assert(fds); + + /* Parse the serialization again, after a daemon reload */ + + r = extract_many_words(&value, NULL, 0, &name, &s0, &s1, NULL); + if (r != 3 || !isempty(value)) { + log_debug("Unable to parse dynamic user line."); + return; + } + + if (safe_atoi(s0, &fd0) < 0 || !fdset_contains(fds, fd0)) { + log_debug("Unable to process dynamic user fd specification."); + return; + } + + if (safe_atoi(s1, &fd1) < 0 || !fdset_contains(fds, fd1)) { + log_debug("Unable to process dynamic user fd specification."); + return; + } + + r = dynamic_user_add(m, name, (int[]) { fd0, fd1 }, NULL); + if (r < 0) { + log_debug_errno(r, "Failed to add dynamic user: %m"); + return; + } + + (void) fdset_remove(fds, fd0); + (void) fdset_remove(fds, fd1); +} + +void dynamic_user_vacuum(Manager *m, bool close_user) { + DynamicUser *d; + Iterator i; + + assert(m); + + /* Empty the dynamic user database, optionally cleaning up orphaned dynamic users, i.e. destroy and free users + * to which no reference exist. This is called after a daemon reload finished, in order to destroy users which + * might not be referenced anymore. */ + + HASHMAP_FOREACH(d, m->dynamic_users, i) { + if (d->n_ref > 0) + continue; + + if (close_user) { + log_debug("Removing orphaned dynamic user %s", d->name); + (void) dynamic_user_close(d); + } + + dynamic_user_free(d); + } +} + +int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret) { + char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_free_ char *user = NULL; + uid_t check_uid; + int r; + + assert(m); + assert(ret); + + /* A friendly way to translate a dynamic user's UID into a name. */ + if (!uid_is_dynamic(uid)) + return -ESRCH; + + xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid); + r = read_one_line_file(lock_path, &user); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + /* The lock file might be stale, hence let's verify the data before we return it */ + r = dynamic_user_lookup_name(m, user, &check_uid); + if (r < 0) + return r; + if (check_uid != uid) /* lock file doesn't match our own idea */ + return -ESRCH; + + *ret = TAKE_PTR(user); + + return 0; +} + +int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret) { + DynamicUser *d; + int r; + + assert(m); + assert(name); + assert(ret); + + /* A friendly call for translating a dynamic user's name into its UID */ + + d = hashmap_get(m->dynamic_users, name); + if (!d) + return -ESRCH; + + r = dynamic_user_current(d, ret); + if (r == -EAGAIN) /* not realized yet? */ + return -ESRCH; + + return r; +} + +int dynamic_creds_acquire(DynamicCreds *creds, Manager *m, const char *user, const char *group) { + bool acquired = false; + int r; + + assert(creds); + assert(m); + + /* A DynamicUser object encapsulates an allocation of both a UID and a GID for a specific name. However, some + * services use different user and groups. For cases like that there's DynamicCreds containing a pair of user + * and group. This call allocates a pair. */ + + if (!creds->user && user) { + r = dynamic_user_acquire(m, user, &creds->user); + if (r < 0) + return r; + + acquired = true; + } + + if (!creds->group) { + + if (creds->user && (!group || streq_ptr(user, group))) + creds->group = dynamic_user_ref(creds->user); + else { + r = dynamic_user_acquire(m, group, &creds->group); + if (r < 0) { + if (acquired) + creds->user = dynamic_user_unref(creds->user); + return r; + } + } + } + + return 0; +} + +int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid) { + uid_t u = UID_INVALID; + gid_t g = GID_INVALID; + int r; + + assert(creds); + assert(uid); + assert(gid); + + /* Realize both the referenced user and group */ + + if (creds->user) { + r = dynamic_user_realize(creds->user, suggested_paths, &u, &g, true); + if (r < 0) + return r; + } + + if (creds->group && creds->group != creds->user) { + r = dynamic_user_realize(creds->group, suggested_paths, NULL, &g, false); + if (r < 0) + return r; + } + + *uid = u; + *gid = g; + return 0; +} + +void dynamic_creds_unref(DynamicCreds *creds) { + assert(creds); + + creds->user = dynamic_user_unref(creds->user); + creds->group = dynamic_user_unref(creds->group); +} + +void dynamic_creds_destroy(DynamicCreds *creds) { + assert(creds); + + creds->user = dynamic_user_destroy(creds->user); + creds->group = dynamic_user_destroy(creds->group); +} diff --git a/src/core/dynamic-user.h b/src/core/dynamic-user.h new file mode 100644 index 0000000..112f91e --- /dev/null +++ b/src/core/dynamic-user.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct DynamicUser DynamicUser; + +typedef struct DynamicCreds { + /* A combination of a dynamic user and group */ + DynamicUser *user; + DynamicUser *group; +} DynamicCreds; + +#include "manager.h" + +/* Note that this object always allocates a pair of user and group under the same name, even if one of them isn't + * used. This means, if you want to allocate a group and user pair, and they might have two different names, then you + * need to allocated two of these objects. DynamicCreds below makes that easy. */ +struct DynamicUser { + unsigned n_ref; + Manager *manager; + + /* An AF_UNIX socket pair that contains a datagram containing both the numeric ID assigned, as well as a lock + * file fd locking the user ID we picked. */ + int storage_socket[2]; + + char name[]; +}; + +int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds); +void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds); +void dynamic_user_vacuum(Manager *m, bool close_user); + +int dynamic_user_current(DynamicUser *d, uid_t *ret); +int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret); +int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret); + +int dynamic_creds_acquire(DynamicCreds *creds, Manager *m, const char *user, const char *group); +int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid); + +void dynamic_creds_unref(DynamicCreds *creds); +void dynamic_creds_destroy(DynamicCreds *creds); diff --git a/src/core/emergency-action.c b/src/core/emergency-action.c new file mode 100644 index 0000000..f98b0de --- /dev/null +++ b/src/core/emergency-action.c @@ -0,0 +1,161 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <sys/reboot.h> + +#include "bus-error.h" +#include "bus-util.h" +#include "emergency-action.h" +#include "raw-reboot.h" +#include "reboot-util.h" +#include "special.h" +#include "string-table.h" +#include "terminal-util.h" +#include "virt.h" + +static void log_and_status(Manager *m, bool warn, const char *message, const char *reason) { + log_full(warn ? LOG_WARNING : LOG_DEBUG, "%s: %s", message, reason); + if (warn) + manager_status_printf(m, STATUS_TYPE_EMERGENCY, + ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL, + "%s: %s", message, reason); +} + +int emergency_action( + Manager *m, + EmergencyAction action, + EmergencyActionFlags options, + const char *reboot_arg, + int exit_status, + const char *reason) { + + assert(m); + assert(action >= 0); + assert(action < _EMERGENCY_ACTION_MAX); + + if (action == EMERGENCY_ACTION_NONE) + return -ECANCELED; + + if (FLAGS_SET(options, EMERGENCY_ACTION_IS_WATCHDOG) && !m->service_watchdogs) { + log_warning("Watchdog disabled! Not acting on: %s", reason); + return -ECANCELED; + } + + bool warn = FLAGS_SET(options, EMERGENCY_ACTION_WARN); + + switch (action) { + + case EMERGENCY_ACTION_REBOOT: + log_and_status(m, warn, "Rebooting", reason); + + (void) update_reboot_parameter_and_warn(reboot_arg); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL); + + break; + + case EMERGENCY_ACTION_REBOOT_FORCE: + log_and_status(m, warn, "Forcibly rebooting", reason); + + (void) update_reboot_parameter_and_warn(reboot_arg); + m->objective = MANAGER_REBOOT; + + break; + + case EMERGENCY_ACTION_REBOOT_IMMEDIATE: + log_and_status(m, warn, "Rebooting immediately", reason); + + sync(); + + if (!isempty(reboot_arg)) { + log_info("Rebooting with argument '%s'.", reboot_arg); + (void) raw_reboot(LINUX_REBOOT_CMD_RESTART2, reboot_arg); + log_warning_errno(errno, "Failed to reboot with parameter, retrying without: %m"); + } + + log_info("Rebooting."); + (void) reboot(RB_AUTOBOOT); + break; + + case EMERGENCY_ACTION_EXIT: + + if (exit_status >= 0) + m->return_value = exit_status; + + if (MANAGER_IS_USER(m) || detect_container() > 0) { + log_and_status(m, warn, "Exiting", reason); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL); + break; + } + + log_notice("Doing \"poweroff\" action instead of an \"exit\" emergency action."); + _fallthrough_; + + case EMERGENCY_ACTION_POWEROFF: + log_and_status(m, warn, "Powering off", reason); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL); + break; + + case EMERGENCY_ACTION_EXIT_FORCE: + + if (exit_status >= 0) + m->return_value = exit_status; + + if (MANAGER_IS_USER(m) || detect_container() > 0) { + log_and_status(m, warn, "Exiting immediately", reason); + m->objective = MANAGER_EXIT; + break; + } + + log_notice("Doing \"poweroff-force\" action instead of an \"exit-force\" emergency action."); + _fallthrough_; + + case EMERGENCY_ACTION_POWEROFF_FORCE: + log_and_status(m, warn, "Forcibly powering off", reason); + m->objective = MANAGER_POWEROFF; + break; + + case EMERGENCY_ACTION_POWEROFF_IMMEDIATE: + log_and_status(m, warn, "Powering off immediately", reason); + + sync(); + + log_info("Powering off."); + (void) reboot(RB_POWER_OFF); + break; + + default: + assert_not_reached("Unknown emergency action"); + } + + return -ECANCELED; +} + +static const char* const emergency_action_table[_EMERGENCY_ACTION_MAX] = { + [EMERGENCY_ACTION_NONE] = "none", + [EMERGENCY_ACTION_REBOOT] = "reboot", + [EMERGENCY_ACTION_REBOOT_FORCE] = "reboot-force", + [EMERGENCY_ACTION_REBOOT_IMMEDIATE] = "reboot-immediate", + [EMERGENCY_ACTION_POWEROFF] = "poweroff", + [EMERGENCY_ACTION_POWEROFF_FORCE] = "poweroff-force", + [EMERGENCY_ACTION_POWEROFF_IMMEDIATE] = "poweroff-immediate", + [EMERGENCY_ACTION_EXIT] = "exit", + [EMERGENCY_ACTION_EXIT_FORCE] = "exit-force", +}; +DEFINE_STRING_TABLE_LOOKUP(emergency_action, EmergencyAction); + +int parse_emergency_action( + const char *value, + bool system, + EmergencyAction *ret) { + + EmergencyAction x; + + x = emergency_action_from_string(value); + if (x < 0) + return -EINVAL; + + if (!system && x != EMERGENCY_ACTION_NONE && x < _EMERGENCY_ACTION_FIRST_USER_ACTION) + return -EOPNOTSUPP; + + *ret = x; + return 0; +} diff --git a/src/core/emergency-action.h b/src/core/emergency-action.h new file mode 100644 index 0000000..6e6c69d --- /dev/null +++ b/src/core/emergency-action.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef enum EmergencyAction { + EMERGENCY_ACTION_NONE, + EMERGENCY_ACTION_REBOOT, + EMERGENCY_ACTION_REBOOT_FORCE, + EMERGENCY_ACTION_REBOOT_IMMEDIATE, + EMERGENCY_ACTION_POWEROFF, + EMERGENCY_ACTION_POWEROFF_FORCE, + EMERGENCY_ACTION_POWEROFF_IMMEDIATE, + EMERGENCY_ACTION_EXIT, + _EMERGENCY_ACTION_FIRST_USER_ACTION = EMERGENCY_ACTION_EXIT, + EMERGENCY_ACTION_EXIT_FORCE, + _EMERGENCY_ACTION_MAX, + _EMERGENCY_ACTION_INVALID = -1 +} EmergencyAction; + +typedef enum EmergencyActionFlags { + EMERGENCY_ACTION_IS_WATCHDOG = 1 << 0, + EMERGENCY_ACTION_WARN = 1 << 1, +} EmergencyActionFlags; + +#include "macro.h" +#include "manager.h" + +int emergency_action(Manager *m, + EmergencyAction action, EmergencyActionFlags options, + const char *reboot_arg, int exit_status, const char *reason); + +const char* emergency_action_to_string(EmergencyAction i) _const_; +EmergencyAction emergency_action_from_string(const char *s) _pure_; + +int parse_emergency_action(const char *value, bool system, EmergencyAction *ret); diff --git a/src/core/execute.c b/src/core/execute.c new file mode 100644 index 0000000..a708231 --- /dev/null +++ b/src/core/execute.c @@ -0,0 +1,5290 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <fcntl.h> +#include <glob.h> +#include <grp.h> +#include <poll.h> +#include <signal.h> +#include <string.h> +#include <sys/capability.h> +#include <sys/eventfd.h> +#include <sys/mman.h> +#include <sys/personality.h> +#include <sys/prctl.h> +#include <sys/shm.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/un.h> +#include <unistd.h> +#include <utmpx.h> + +#if HAVE_PAM +#include <security/pam_appl.h> +#endif + +#if HAVE_SELINUX +#include <selinux/selinux.h> +#endif + +#if HAVE_SECCOMP +#include <seccomp.h> +#endif + +#if HAVE_APPARMOR +#include <sys/apparmor.h> +#endif + +#include "sd-messages.h" + +#include "af-list.h" +#include "alloc-util.h" +#if HAVE_APPARMOR +#include "apparmor-util.h" +#endif +#include "async.h" +#include "barrier.h" +#include "cap-list.h" +#include "capability-util.h" +#include "chown-recursive.h" +#include "cpu-set-util.h" +#include "def.h" +#include "env-file.h" +#include "env-util.h" +#include "errno-list.h" +#include "execute.h" +#include "exit-status.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "glob-util.h" +#include "io-util.h" +#include "ioprio.h" +#include "label.h" +#include "log.h" +#include "macro.h" +#include "manager.h" +#include "missing.h" +#include "mkdir.h" +#include "namespace.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif +#include "securebits-util.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "smack-util.h" +#include "socket-util.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "umask-util.h" +#include "unit.h" +#include "user-util.h" +#include "util.h" +#include "utmp-wtmp.h" + +#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC) +#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC) + +/* This assumes there is a 'tty' group */ +#define TTY_MODE 0620 + +#define SNDBUF_SIZE (8*1024*1024) + +static int shift_fds(int fds[], size_t n_fds) { + int start, restart_from; + + if (n_fds <= 0) + return 0; + + /* Modifies the fds array! (sorts it) */ + + assert(fds); + + start = 0; + for (;;) { + int i; + + restart_from = -1; + + for (i = start; i < (int) n_fds; i++) { + int nfd; + + /* Already at right index? */ + if (fds[i] == i+3) + continue; + + nfd = fcntl(fds[i], F_DUPFD, i + 3); + if (nfd < 0) + return -errno; + + safe_close(fds[i]); + fds[i] = nfd; + + /* Hmm, the fd we wanted isn't free? Then + * let's remember that and try again from here */ + if (nfd != i+3 && restart_from < 0) + restart_from = i; + } + + if (restart_from < 0) + break; + + start = restart_from; + } + + return 0; +} + +static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) { + size_t i, n_fds; + int r; + + n_fds = n_socket_fds + n_storage_fds; + if (n_fds <= 0) + return 0; + + assert(fds); + + /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags. + * O_NONBLOCK only applies to socket activation though. */ + + for (i = 0; i < n_fds; i++) { + + if (i < n_socket_fds) { + r = fd_nonblock(fds[i], nonblock); + if (r < 0) + return r; + } + + /* We unconditionally drop FD_CLOEXEC from the fds, + * since after all we want to pass these fds to our + * children */ + + r = fd_cloexec(fds[i], false); + if (r < 0) + return r; + } + + return 0; +} + +static const char *exec_context_tty_path(const ExecContext *context) { + assert(context); + + if (context->stdio_as_fds) + return NULL; + + if (context->tty_path) + return context->tty_path; + + return "/dev/console"; +} + +static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) { + const char *path; + + assert(context); + + path = exec_context_tty_path(context); + + if (context->tty_vhangup) { + if (p && p->stdin_fd >= 0) + (void) terminal_vhangup_fd(p->stdin_fd); + else if (path) + (void) terminal_vhangup(path); + } + + if (context->tty_reset) { + if (p && p->stdin_fd >= 0) + (void) reset_terminal_fd(p->stdin_fd, true); + else if (path) + (void) reset_terminal(path); + } + + if (context->tty_vt_disallocate && path) + (void) vt_disallocate(path); +} + +static bool is_terminal_input(ExecInput i) { + return IN_SET(i, + EXEC_INPUT_TTY, + EXEC_INPUT_TTY_FORCE, + EXEC_INPUT_TTY_FAIL); +} + +static bool is_terminal_output(ExecOutput o) { + return IN_SET(o, + EXEC_OUTPUT_TTY, + EXEC_OUTPUT_SYSLOG_AND_CONSOLE, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE); +} + +static bool is_syslog_output(ExecOutput o) { + return IN_SET(o, + EXEC_OUTPUT_SYSLOG, + EXEC_OUTPUT_SYSLOG_AND_CONSOLE); +} + +static bool is_kmsg_output(ExecOutput o) { + return IN_SET(o, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_KMSG_AND_CONSOLE); +} + +static bool exec_context_needs_term(const ExecContext *c) { + assert(c); + + /* Return true if the execution context suggests we should set $TERM to something useful. */ + + if (is_terminal_input(c->std_input)) + return true; + + if (is_terminal_output(c->std_output)) + return true; + + if (is_terminal_output(c->std_error)) + return true; + + return !!c->tty_path; +} + +static int open_null_as(int flags, int nfd) { + int fd; + + assert(nfd >= 0); + + fd = open("/dev/null", flags|O_NOCTTY); + if (fd < 0) + return -errno; + + return move_fd(fd, nfd, false); +} + +static int connect_journal_socket(int fd, uid_t uid, gid_t gid) { + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/journal/stdout", + }; + uid_t olduid = UID_INVALID; + gid_t oldgid = GID_INVALID; + int r; + + if (gid_is_valid(gid)) { + oldgid = getgid(); + + if (setegid(gid) < 0) + return -errno; + } + + if (uid_is_valid(uid)) { + olduid = getuid(); + + if (seteuid(uid) < 0) { + r = -errno; + goto restore_gid; + } + } + + r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0; + + /* If we fail to restore the uid or gid, things will likely + fail later on. This should only happen if an LSM interferes. */ + + if (uid_is_valid(uid)) + (void) seteuid(olduid); + + restore_gid: + if (gid_is_valid(gid)) + (void) setegid(oldgid); + + return r; +} + +static int connect_logger_as( + const Unit *unit, + const ExecContext *context, + const ExecParameters *params, + ExecOutput output, + const char *ident, + int nfd, + uid_t uid, + gid_t gid) { + + _cleanup_close_ int fd = -1; + int r; + + assert(context); + assert(params); + assert(output < _EXEC_OUTPUT_MAX); + assert(ident); + assert(nfd >= 0); + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -errno; + + r = connect_journal_socket(fd, uid, gid); + if (r < 0) + return r; + + if (shutdown(fd, SHUT_RD) < 0) + return -errno; + + (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); + + if (dprintf(fd, + "%s\n" + "%s\n" + "%i\n" + "%i\n" + "%i\n" + "%i\n" + "%i\n", + context->syslog_identifier ?: ident, + params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "", + context->syslog_priority, + !!context->syslog_level_prefix, + is_syslog_output(output), + is_kmsg_output(output), + is_terminal_output(output)) < 0) + return -errno; + + return move_fd(TAKE_FD(fd), nfd, false); +} + +static int open_terminal_as(const char *path, int flags, int nfd) { + int fd; + + assert(path); + assert(nfd >= 0); + + fd = open_terminal(path, flags | O_NOCTTY); + if (fd < 0) + return fd; + + return move_fd(fd, nfd, false); +} + +static int acquire_path(const char *path, int flags, mode_t mode) { + union sockaddr_union sa = {}; + _cleanup_close_ int fd = -1; + int r, salen; + + assert(path); + + if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR)) + flags |= O_CREAT; + + fd = open(path, flags|O_NOCTTY, mode); + if (fd >= 0) + return TAKE_FD(fd); + + if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */ + return -errno; + if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */ + return -ENXIO; + + /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */ + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -errno; + + salen = sockaddr_un_set_path(&sa.un, path); + if (salen < 0) + return salen; + + if (connect(fd, &sa.sa, salen) < 0) + return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have + * indication that his wasn't an AF_UNIX socket after all */ + + if ((flags & O_ACCMODE) == O_RDONLY) + r = shutdown(fd, SHUT_WR); + else if ((flags & O_ACCMODE) == O_WRONLY) + r = shutdown(fd, SHUT_RD); + else + return TAKE_FD(fd); + if (r < 0) + return -errno; + + return TAKE_FD(fd); +} + +static int fixup_input( + const ExecContext *context, + int socket_fd, + bool apply_tty_stdin) { + + ExecInput std_input; + + assert(context); + + std_input = context->std_input; + + if (is_terminal_input(std_input) && !apply_tty_stdin) + return EXEC_INPUT_NULL; + + if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0) + return EXEC_INPUT_NULL; + + if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0) + return EXEC_INPUT_NULL; + + return std_input; +} + +static int fixup_output(ExecOutput std_output, int socket_fd) { + + if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0) + return EXEC_OUTPUT_INHERIT; + + return std_output; +} + +static int setup_input( + const ExecContext *context, + const ExecParameters *params, + int socket_fd, + int named_iofds[3]) { + + ExecInput i; + + assert(context); + assert(params); + + if (params->stdin_fd >= 0) { + if (dup2(params->stdin_fd, STDIN_FILENO) < 0) + return -errno; + + /* Try to make this the controlling tty, if it is a tty, and reset it */ + if (isatty(STDIN_FILENO)) { + (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE); + (void) reset_terminal_fd(STDIN_FILENO, true); + } + + return STDIN_FILENO; + } + + i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); + + switch (i) { + + case EXEC_INPUT_NULL: + return open_null_as(O_RDONLY, STDIN_FILENO); + + case EXEC_INPUT_TTY: + case EXEC_INPUT_TTY_FORCE: + case EXEC_INPUT_TTY_FAIL: { + int fd; + + fd = acquire_terminal(exec_context_tty_path(context), + i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY : + i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE : + ACQUIRE_TERMINAL_WAIT, + USEC_INFINITY); + if (fd < 0) + return fd; + + return move_fd(fd, STDIN_FILENO, false); + } + + case EXEC_INPUT_SOCKET: + assert(socket_fd >= 0); + + return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO; + + case EXEC_INPUT_NAMED_FD: + assert(named_iofds[STDIN_FILENO] >= 0); + + (void) fd_nonblock(named_iofds[STDIN_FILENO], false); + return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO; + + case EXEC_INPUT_DATA: { + int fd; + + fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0); + if (fd < 0) + return fd; + + return move_fd(fd, STDIN_FILENO, false); + } + + case EXEC_INPUT_FILE: { + bool rw; + int fd; + + assert(context->stdio_file[STDIN_FILENO]); + + rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) || + (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO])); + + fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask); + if (fd < 0) + return fd; + + return move_fd(fd, STDIN_FILENO, false); + } + + default: + assert_not_reached("Unknown input type"); + } +} + +static bool can_inherit_stderr_from_stdout( + const ExecContext *context, + ExecOutput o, + ExecOutput e) { + + assert(context); + + /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the + * stderr fd */ + + if (e == EXEC_OUTPUT_INHERIT) + return true; + if (e != o) + return false; + + if (e == EXEC_OUTPUT_NAMED_FD) + return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]); + + if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND)) + return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]); + + return true; +} + +static int setup_output( + const Unit *unit, + const ExecContext *context, + const ExecParameters *params, + int fileno, + int socket_fd, + int named_iofds[3], + const char *ident, + uid_t uid, + gid_t gid, + dev_t *journal_stream_dev, + ino_t *journal_stream_ino) { + + ExecOutput o; + ExecInput i; + int r; + + assert(unit); + assert(context); + assert(params); + assert(ident); + assert(journal_stream_dev); + assert(journal_stream_ino); + + if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) { + + if (dup2(params->stdout_fd, STDOUT_FILENO) < 0) + return -errno; + + return STDOUT_FILENO; + } + + if (fileno == STDERR_FILENO && params->stderr_fd >= 0) { + if (dup2(params->stderr_fd, STDERR_FILENO) < 0) + return -errno; + + return STDERR_FILENO; + } + + i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); + o = fixup_output(context->std_output, socket_fd); + + if (fileno == STDERR_FILENO) { + ExecOutput e; + e = fixup_output(context->std_error, socket_fd); + + /* This expects the input and output are already set up */ + + /* Don't change the stderr file descriptor if we inherit all + * the way and are not on a tty */ + if (e == EXEC_OUTPUT_INHERIT && + o == EXEC_OUTPUT_INHERIT && + i == EXEC_INPUT_NULL && + !is_terminal_input(context->std_input) && + getppid () != 1) + return fileno; + + /* Duplicate from stdout if possible */ + if (can_inherit_stderr_from_stdout(context, o, e)) + return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno; + + o = e; + + } else if (o == EXEC_OUTPUT_INHERIT) { + /* If input got downgraded, inherit the original value */ + if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input)) + return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno); + + /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */ + if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA)) + return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; + + /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */ + if (getppid() != 1) + return fileno; + + /* We need to open /dev/null here anew, to get the right access mode. */ + return open_null_as(O_WRONLY, fileno); + } + + switch (o) { + + case EXEC_OUTPUT_NULL: + return open_null_as(O_WRONLY, fileno); + + case EXEC_OUTPUT_TTY: + if (is_terminal_input(i)) + return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; + + /* We don't reset the terminal if this is just about output */ + return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno); + + case EXEC_OUTPUT_SYSLOG: + case EXEC_OUTPUT_SYSLOG_AND_CONSOLE: + case EXEC_OUTPUT_KMSG: + case EXEC_OUTPUT_KMSG_AND_CONSOLE: + case EXEC_OUTPUT_JOURNAL: + case EXEC_OUTPUT_JOURNAL_AND_CONSOLE: + r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid); + if (r < 0) { + log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr"); + r = open_null_as(O_WRONLY, fileno); + } else { + struct stat st; + + /* If we connected this fd to the journal via a stream, patch the device/inode into the passed + * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits + * services to detect whether they are connected to the journal or not. + * + * If both stdout and stderr are connected to a stream then let's make sure to store the data + * about STDERR as that's usually the best way to do logging. */ + + if (fstat(fileno, &st) >= 0 && + (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) { + *journal_stream_dev = st.st_dev; + *journal_stream_ino = st.st_ino; + } + } + return r; + + case EXEC_OUTPUT_SOCKET: + assert(socket_fd >= 0); + + return dup2(socket_fd, fileno) < 0 ? -errno : fileno; + + case EXEC_OUTPUT_NAMED_FD: + assert(named_iofds[fileno] >= 0); + + (void) fd_nonblock(named_iofds[fileno], false); + return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno; + + case EXEC_OUTPUT_FILE: + case EXEC_OUTPUT_FILE_APPEND: { + bool rw; + int fd, flags; + + assert(context->stdio_file[fileno]); + + rw = context->std_input == EXEC_INPUT_FILE && + streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]); + + if (rw) + return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; + + flags = O_WRONLY; + if (o == EXEC_OUTPUT_FILE_APPEND) + flags |= O_APPEND; + + fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask); + if (fd < 0) + return fd; + + return move_fd(fd, fileno, 0); + } + + default: + assert_not_reached("Unknown error type"); + } +} + +static int chown_terminal(int fd, uid_t uid) { + struct stat st; + + assert(fd >= 0); + + /* Before we chown/chmod the TTY, let's ensure this is actually a tty */ + if (isatty(fd) < 1) + return 0; + + /* This might fail. What matters are the results. */ + (void) fchown(fd, uid, -1); + (void) fchmod(fd, TTY_MODE); + + if (fstat(fd, &st) < 0) + return -errno; + + if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE) + return -EPERM; + + return 0; +} + +static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) { + _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1; + int r; + + assert(_saved_stdin); + assert(_saved_stdout); + + saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3); + if (saved_stdin < 0) + return -errno; + + saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3); + if (saved_stdout < 0) + return -errno; + + fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC); + if (fd < 0) + return fd; + + r = chown_terminal(fd, getuid()); + if (r < 0) + return r; + + r = reset_terminal_fd(fd, true); + if (r < 0) + return r; + + r = rearrange_stdio(fd, fd, STDERR_FILENO); + fd = -1; + if (r < 0) + return r; + + *_saved_stdin = saved_stdin; + *_saved_stdout = saved_stdout; + + saved_stdin = saved_stdout = -1; + + return 0; +} + +static void write_confirm_error_fd(int err, int fd, const Unit *u) { + assert(err < 0); + + if (err == -ETIMEDOUT) + dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id); + else { + errno = -err; + dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id); + } +} + +static void write_confirm_error(int err, const char *vc, const Unit *u) { + _cleanup_close_ int fd = -1; + + assert(vc); + + fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return; + + write_confirm_error_fd(err, fd, u); +} + +static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) { + int r = 0; + + assert(saved_stdin); + assert(saved_stdout); + + release_terminal(); + + if (*saved_stdin >= 0) + if (dup2(*saved_stdin, STDIN_FILENO) < 0) + r = -errno; + + if (*saved_stdout >= 0) + if (dup2(*saved_stdout, STDOUT_FILENO) < 0) + r = -errno; + + *saved_stdin = safe_close(*saved_stdin); + *saved_stdout = safe_close(*saved_stdout); + + return r; +} + +enum { + CONFIRM_PRETEND_FAILURE = -1, + CONFIRM_PRETEND_SUCCESS = 0, + CONFIRM_EXECUTE = 1, +}; + +static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) { + int saved_stdout = -1, saved_stdin = -1, r; + _cleanup_free_ char *e = NULL; + char c; + + /* For any internal errors, assume a positive response. */ + r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout); + if (r < 0) { + write_confirm_error(r, vc, u); + return CONFIRM_EXECUTE; + } + + /* confirm_spawn might have been disabled while we were sleeping. */ + if (manager_is_confirm_spawn_disabled(u->manager)) { + r = 1; + goto restore_stdio; + } + + e = ellipsize(cmdline, 60, 100); + if (!e) { + log_oom(); + r = CONFIRM_EXECUTE; + goto restore_stdio; + } + + for (;;) { + r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e); + if (r < 0) { + write_confirm_error_fd(r, STDOUT_FILENO, u); + r = CONFIRM_EXECUTE; + goto restore_stdio; + } + + switch (c) { + case 'c': + printf("Resuming normal execution.\n"); + manager_disable_confirm_spawn(); + r = 1; + break; + case 'D': + unit_dump(u, stdout, " "); + continue; /* ask again */ + case 'f': + printf("Failing execution.\n"); + r = CONFIRM_PRETEND_FAILURE; + break; + case 'h': + printf(" c - continue, proceed without asking anymore\n" + " D - dump, show the state of the unit\n" + " f - fail, don't execute the command and pretend it failed\n" + " h - help\n" + " i - info, show a short summary of the unit\n" + " j - jobs, show jobs that are in progress\n" + " s - skip, don't execute the command and pretend it succeeded\n" + " y - yes, execute the command\n"); + continue; /* ask again */ + case 'i': + printf(" Description: %s\n" + " Unit: %s\n" + " Command: %s\n", + u->id, u->description, cmdline); + continue; /* ask again */ + case 'j': + manager_dump_jobs(u->manager, stdout, " "); + continue; /* ask again */ + case 'n': + /* 'n' was removed in favor of 'f'. */ + printf("Didn't understand 'n', did you mean 'f'?\n"); + continue; /* ask again */ + case 's': + printf("Skipping execution.\n"); + r = CONFIRM_PRETEND_SUCCESS; + break; + case 'y': + r = CONFIRM_EXECUTE; + break; + default: + assert_not_reached("Unhandled choice"); + } + break; + } + +restore_stdio: + restore_confirm_stdio(&saved_stdin, &saved_stdout); + return r; +} + +static int get_fixed_user(const ExecContext *c, const char **user, + uid_t *uid, gid_t *gid, + const char **home, const char **shell) { + int r; + const char *name; + + assert(c); + + if (!c->user) + return 0; + + /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway + * (i.e. are "/" or "/bin/nologin"). */ + + name = c->user; + r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN); + if (r < 0) + return r; + + *user = name; + return 0; +} + +static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) { + int r; + const char *name; + + assert(c); + + if (!c->group) + return 0; + + name = c->group; + r = get_group_creds(&name, gid, 0); + if (r < 0) + return r; + + *group = name; + return 0; +} + +static int get_supplementary_groups(const ExecContext *c, const char *user, + const char *group, gid_t gid, + gid_t **supplementary_gids, int *ngids) { + char **i; + int r, k = 0; + int ngroups_max; + bool keep_groups = false; + gid_t *groups = NULL; + _cleanup_free_ gid_t *l_gids = NULL; + + assert(c); + + /* + * If user is given, then lookup GID and supplementary groups list. + * We avoid NSS lookups for gid=0. Also we have to initialize groups + * here and as early as possible so we keep the list of supplementary + * groups of the caller. + */ + if (user && gid_is_valid(gid) && gid != 0) { + /* First step, initialize groups from /etc/groups */ + if (initgroups(user, gid) < 0) + return -errno; + + keep_groups = true; + } + + if (strv_isempty(c->supplementary_groups)) + return 0; + + /* + * If SupplementaryGroups= was passed then NGROUPS_MAX has to + * be positive, otherwise fail. + */ + errno = 0; + ngroups_max = (int) sysconf(_SC_NGROUPS_MAX); + if (ngroups_max <= 0) { + if (errno > 0) + return -errno; + else + return -EOPNOTSUPP; /* For all other values */ + } + + l_gids = new(gid_t, ngroups_max); + if (!l_gids) + return -ENOMEM; + + if (keep_groups) { + /* + * Lookup the list of groups that the user belongs to, we + * avoid NSS lookups here too for gid=0. + */ + k = ngroups_max; + if (getgrouplist(user, gid, l_gids, &k) < 0) + return -EINVAL; + } else + k = 0; + + STRV_FOREACH(i, c->supplementary_groups) { + const char *g; + + if (k >= ngroups_max) + return -E2BIG; + + g = *i; + r = get_group_creds(&g, l_gids+k, 0); + if (r < 0) + return r; + + k++; + } + + /* + * Sets ngids to zero to drop all supplementary groups, happens + * when we are under root and SupplementaryGroups= is empty. + */ + if (k == 0) { + *ngids = 0; + return 0; + } + + /* Otherwise get the final list of supplementary groups */ + groups = memdup(l_gids, sizeof(gid_t) * k); + if (!groups) + return -ENOMEM; + + *supplementary_gids = groups; + *ngids = k; + + groups = NULL; + + return 0; +} + +static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) { + int r; + + /* Handle SupplementaryGroups= if it is not empty */ + if (ngids > 0) { + r = maybe_setgroups(ngids, supplementary_gids); + if (r < 0) + return r; + } + + if (gid_is_valid(gid)) { + /* Then set our gids */ + if (setresgid(gid, gid, gid) < 0) + return -errno; + } + + return 0; +} + +static int enforce_user(const ExecContext *context, uid_t uid) { + assert(context); + + if (!uid_is_valid(uid)) + return 0; + + /* Sets (but doesn't look up) the uid and make sure we keep the + * capabilities while doing so. */ + + if (context->capability_ambient_set != 0) { + + /* First step: If we need to keep capabilities but + * drop privileges we need to make sure we keep our + * caps, while we drop privileges. */ + if (uid != 0) { + int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS; + + if (prctl(PR_GET_SECUREBITS) != sb) + if (prctl(PR_SET_SECUREBITS, sb) < 0) + return -errno; + } + } + + /* Second step: actually set the uids */ + if (setresuid(uid, uid, uid) < 0) + return -errno; + + /* At this point we should have all necessary capabilities but + are otherwise a normal user. However, the caps might got + corrupted due to the setresuid() so we need clean them up + later. This is done outside of this call. */ + + return 0; +} + +#if HAVE_PAM + +static int null_conv( + int num_msg, + const struct pam_message **msg, + struct pam_response **resp, + void *appdata_ptr) { + + /* We don't support conversations */ + + return PAM_CONV_ERR; +} + +#endif + +static int setup_pam( + const char *name, + const char *user, + uid_t uid, + gid_t gid, + const char *tty, + char ***env, + int fds[], size_t n_fds) { + +#if HAVE_PAM + + static const struct pam_conv conv = { + .conv = null_conv, + .appdata_ptr = NULL + }; + + _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; + pam_handle_t *handle = NULL; + sigset_t old_ss; + int pam_code = PAM_SUCCESS, r; + char **nv, **e = NULL; + bool close_session = false; + pid_t pam_pid = 0, parent_pid; + int flags = 0; + + assert(name); + assert(user); + assert(env); + + /* We set up PAM in the parent process, then fork. The child + * will then stay around until killed via PR_GET_PDEATHSIG or + * systemd via the cgroup logic. It will then remove the PAM + * session again. The parent process will exec() the actual + * daemon. We do things this way to ensure that the main PID + * of the daemon is the one we initially fork()ed. */ + + r = barrier_create(&barrier); + if (r < 0) + goto fail; + + if (log_get_max_level() < LOG_DEBUG) + flags |= PAM_SILENT; + + pam_code = pam_start(name, user, &conv, &handle); + if (pam_code != PAM_SUCCESS) { + handle = NULL; + goto fail; + } + + if (!tty) { + _cleanup_free_ char *q = NULL; + + /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure + * out if that's the case, and read the TTY off it. */ + + if (getttyname_malloc(STDIN_FILENO, &q) >= 0) + tty = strjoina("/dev/", q); + } + + if (tty) { + pam_code = pam_set_item(handle, PAM_TTY, tty); + if (pam_code != PAM_SUCCESS) + goto fail; + } + + STRV_FOREACH(nv, *env) { + pam_code = pam_putenv(handle, *nv); + if (pam_code != PAM_SUCCESS) + goto fail; + } + + pam_code = pam_acct_mgmt(handle, flags); + if (pam_code != PAM_SUCCESS) + goto fail; + + pam_code = pam_open_session(handle, flags); + if (pam_code != PAM_SUCCESS) + goto fail; + + close_session = true; + + e = pam_getenvlist(handle); + if (!e) { + pam_code = PAM_BUF_ERR; + goto fail; + } + + /* Block SIGTERM, so that we know that it won't get lost in + * the child */ + + assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0); + + parent_pid = getpid_cached(); + + r = safe_fork("(sd-pam)", 0, &pam_pid); + if (r < 0) + goto fail; + if (r == 0) { + int sig, ret = EXIT_PAM; + + /* The child's job is to reset the PAM session on + * termination */ + barrier_set_role(&barrier, BARRIER_CHILD); + + /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds + * are open here that have been opened by PAM. */ + (void) close_many(fds, n_fds); + + /* Drop privileges - we don't need any to pam_close_session + * and this will make PR_SET_PDEATHSIG work in most cases. + * If this fails, ignore the error - but expect sd-pam threads + * to fail to exit normally */ + + r = maybe_setgroups(0, NULL); + if (r < 0) + log_warning_errno(r, "Failed to setgroups() in sd-pam: %m"); + if (setresgid(gid, gid, gid) < 0) + log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m"); + if (setresuid(uid, uid, uid) < 0) + log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m"); + + (void) ignore_signals(SIGPIPE, -1); + + /* Wait until our parent died. This will only work if + * the above setresuid() succeeds, otherwise the kernel + * will not allow unprivileged parents kill their privileged + * children this way. We rely on the control groups kill logic + * to do the rest for us. */ + if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0) + goto child_finish; + + /* Tell the parent that our setup is done. This is especially + * important regarding dropping privileges. Otherwise, unit + * setup might race against our setresuid(2) call. + * + * If the parent aborted, we'll detect this below, hence ignore + * return failure here. */ + (void) barrier_place(&barrier); + + /* Check if our parent process might already have died? */ + if (getppid() == parent_pid) { + sigset_t ss; + + assert_se(sigemptyset(&ss) >= 0); + assert_se(sigaddset(&ss, SIGTERM) >= 0); + + for (;;) { + if (sigwait(&ss, &sig) < 0) { + if (errno == EINTR) + continue; + + goto child_finish; + } + + assert(sig == SIGTERM); + break; + } + } + + /* If our parent died we'll end the session */ + if (getppid() != parent_pid) { + pam_code = pam_close_session(handle, flags); + if (pam_code != PAM_SUCCESS) + goto child_finish; + } + + ret = 0; + + child_finish: + pam_end(handle, pam_code | flags); + _exit(ret); + } + + barrier_set_role(&barrier, BARRIER_PARENT); + + /* If the child was forked off successfully it will do all the + * cleanups, so forget about the handle here. */ + handle = NULL; + + /* Unblock SIGTERM again in the parent */ + assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0); + + /* We close the log explicitly here, since the PAM modules + * might have opened it, but we don't want this fd around. */ + closelog(); + + /* Synchronously wait for the child to initialize. We don't care for + * errors as we cannot recover. However, warn loudly if it happens. */ + if (!barrier_place_and_sync(&barrier)) + log_error("PAM initialization failed"); + + return strv_free_and_replace(*env, e); + +fail: + if (pam_code != PAM_SUCCESS) { + log_error("PAM failed: %s", pam_strerror(handle, pam_code)); + r = -EPERM; /* PAM errors do not map to errno */ + } else + log_error_errno(r, "PAM failed: %m"); + + if (handle) { + if (close_session) + pam_code = pam_close_session(handle, flags); + + pam_end(handle, pam_code | flags); + } + + strv_free(e); + closelog(); + + return r; +#else + return 0; +#endif +} + +static void rename_process_from_path(const char *path) { + char process_name[11]; + const char *p; + size_t l; + + /* This resulting string must fit in 10 chars (i.e. the length + * of "/sbin/init") to look pretty in /bin/ps */ + + p = basename(path); + if (isempty(p)) { + rename_process("(...)"); + return; + } + + l = strlen(p); + if (l > 8) { + /* The end of the process name is usually more + * interesting, since the first bit might just be + * "systemd-" */ + p = p + l - 8; + l = 8; + } + + process_name[0] = '('; + memcpy(process_name+1, p, l); + process_name[1+l] = ')'; + process_name[1+l+1] = 0; + + rename_process(process_name); +} + +static bool context_has_address_families(const ExecContext *c) { + assert(c); + + return c->address_families_whitelist || + !set_isempty(c->address_families); +} + +static bool context_has_syscall_filters(const ExecContext *c) { + assert(c); + + return c->syscall_whitelist || + !hashmap_isempty(c->syscall_filter); +} + +static bool context_has_no_new_privileges(const ExecContext *c) { + assert(c); + + if (c->no_new_privileges) + return true; + + if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */ + return false; + + /* We need NNP if we have any form of seccomp and are unprivileged */ + return context_has_address_families(c) || + c->memory_deny_write_execute || + c->restrict_realtime || + exec_context_restrict_namespaces_set(c) || + c->protect_kernel_tunables || + c->protect_kernel_modules || + c->private_devices || + context_has_syscall_filters(c) || + !set_isempty(c->syscall_archs) || + c->lock_personality; +} + +#if HAVE_SECCOMP + +static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { + + if (is_seccomp_available()) + return false; + + log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg); + return true; +} + +static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) { + uint32_t negative_action, default_action, action; + int r; + + assert(u); + assert(c); + + if (!context_has_syscall_filters(c)) + return 0; + + if (skip_seccomp_unavailable(u, "SystemCallFilter=")) + return 0; + + negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno); + + if (c->syscall_whitelist) { + default_action = negative_action; + action = SCMP_ACT_ALLOW; + } else { + default_action = SCMP_ACT_ALLOW; + action = negative_action; + } + + if (needs_ambient_hack) { + r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID); + if (r < 0) + return r; + } + + return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false); +} + +static int apply_syscall_archs(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + if (set_isempty(c->syscall_archs)) + return 0; + + if (skip_seccomp_unavailable(u, "SystemCallArchitectures=")) + return 0; + + return seccomp_restrict_archs(c->syscall_archs); +} + +static int apply_address_families(const Unit* u, const ExecContext *c) { + assert(u); + assert(c); + + if (!context_has_address_families(c)) + return 0; + + if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) + return 0; + + return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist); +} + +static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { + assert(u); + assert(c); + + if (!c->memory_deny_write_execute) + return 0; + + if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) + return 0; + + return seccomp_memory_deny_write_execute(); +} + +static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { + assert(u); + assert(c); + + if (!c->restrict_realtime) + return 0; + + if (skip_seccomp_unavailable(u, "RestrictRealtime=")) + return 0; + + return seccomp_restrict_realtime(); +} + +static int apply_protect_sysctl(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but + * let's protect even those systems where this is left on in the kernel. */ + + if (!c->protect_kernel_tunables) + return 0; + + if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) + return 0; + + return seccomp_protect_sysctl(); +} + +static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + /* Turn off module syscalls on ProtectKernelModules=yes */ + + if (!c->protect_kernel_modules) + return 0; + + if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) + return 0; + + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false); +} + +static int apply_private_devices(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */ + + if (!c->private_devices) + return 0; + + if (skip_seccomp_unavailable(u, "PrivateDevices=")) + return 0; + + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false); +} + +static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + if (!exec_context_restrict_namespaces_set(c)) + return 0; + + if (skip_seccomp_unavailable(u, "RestrictNamespaces=")) + return 0; + + return seccomp_restrict_namespaces(c->restrict_namespaces); +} + +static int apply_lock_personality(const Unit* u, const ExecContext *c) { + unsigned long personality; + int r; + + assert(u); + assert(c); + + if (!c->lock_personality) + return 0; + + if (skip_seccomp_unavailable(u, "LockPersonality=")) + return 0; + + personality = c->personality; + + /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */ + if (personality == PERSONALITY_INVALID) { + + r = opinionated_personality(&personality); + if (r < 0) + return r; + } + + return seccomp_lock_personality(personality); +} + +#endif + +static void do_idle_pipe_dance(int idle_pipe[static 4]) { + assert(idle_pipe); + + idle_pipe[1] = safe_close(idle_pipe[1]); + idle_pipe[2] = safe_close(idle_pipe[2]); + + if (idle_pipe[0] >= 0) { + int r; + + r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC); + + if (idle_pipe[3] >= 0 && r == 0 /* timeout */) { + ssize_t n; + + /* Signal systemd that we are bored and want to continue. */ + n = write(idle_pipe[3], "x", 1); + if (n > 0) + /* Wait for systemd to react to the signal above. */ + fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC); + } + + idle_pipe[0] = safe_close(idle_pipe[0]); + + } + + idle_pipe[3] = safe_close(idle_pipe[3]); +} + +static const char *exec_directory_env_name_to_string(ExecDirectoryType t); + +static int build_environment( + const Unit *u, + const ExecContext *c, + const ExecParameters *p, + size_t n_fds, + const char *home, + const char *username, + const char *shell, + dev_t journal_stream_dev, + ino_t journal_stream_ino, + char ***ret) { + + _cleanup_strv_free_ char **our_env = NULL; + ExecDirectoryType t; + size_t n_env = 0; + char *x; + + assert(u); + assert(c); + assert(p); + assert(ret); + + our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX); + if (!our_env) + return -ENOMEM; + + if (n_fds > 0) { + _cleanup_free_ char *joined = NULL; + + if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + our_env[n_env++] = x; + + if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0) + return -ENOMEM; + our_env[n_env++] = x; + + joined = strv_join(p->fd_names, ":"); + if (!joined) + return -ENOMEM; + + x = strjoin("LISTEN_FDNAMES=", joined); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) { + if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + our_env[n_env++] = x; + + if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0) + return -ENOMEM; + our_env[n_env++] = x; + } + + /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic + * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but + * check the database directly. */ + if (p->flags & EXEC_NSS_BYPASS_BUS) { + x = strdup("SYSTEMD_NSS_BYPASS_BUS=1"); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if (home) { + x = strappend("HOME=", home); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if (username) { + x = strappend("LOGNAME=", username); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + + x = strappend("USER=", username); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if (shell) { + x = strappend("SHELL=", shell); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if (!sd_id128_is_null(u->invocation_id)) { + if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0) + return -ENOMEM; + + our_env[n_env++] = x; + } + + if (exec_context_needs_term(c)) { + const char *tty_path, *term = NULL; + + tty_path = exec_context_tty_path(c); + + /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit + * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager + * passes to PID 1 ends up all the way in the console login shown. */ + + if (path_equal(tty_path, "/dev/console") && getppid() == 1) + term = getenv("TERM"); + if (!term) + term = default_term_for_tty(tty_path); + + x = strappend("TERM=", term); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if (journal_stream_dev != 0 && journal_stream_ino != 0) { + if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0) + return -ENOMEM; + + our_env[n_env++] = x; + } + + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + _cleanup_free_ char *pre = NULL, *joined = NULL; + const char *n; + + if (!p->prefix[t]) + continue; + + if (strv_isempty(c->directories[t].paths)) + continue; + + n = exec_directory_env_name_to_string(t); + if (!n) + continue; + + pre = strjoin(p->prefix[t], "/"); + if (!pre) + return -ENOMEM; + + joined = strv_join_prefix(c->directories[t].paths, ":", pre); + if (!joined) + return -ENOMEM; + + x = strjoin(n, "=", joined); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + + our_env[n_env++] = NULL; + assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX); + + *ret = TAKE_PTR(our_env); + + return 0; +} + +static int build_pass_environment(const ExecContext *c, char ***ret) { + _cleanup_strv_free_ char **pass_env = NULL; + size_t n_env = 0, n_bufsize = 0; + char **i; + + STRV_FOREACH(i, c->pass_environment) { + _cleanup_free_ char *x = NULL; + char *v; + + v = getenv(*i); + if (!v) + continue; + x = strjoin(*i, "=", v); + if (!x) + return -ENOMEM; + + if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2)) + return -ENOMEM; + + pass_env[n_env++] = TAKE_PTR(x); + pass_env[n_env] = NULL; + } + + *ret = TAKE_PTR(pass_env); + + return 0; +} + +static bool exec_needs_mount_namespace( + const ExecContext *context, + const ExecParameters *params, + const ExecRuntime *runtime) { + + assert(context); + assert(params); + + if (context->root_image) + return true; + + if (!strv_isempty(context->read_write_paths) || + !strv_isempty(context->read_only_paths) || + !strv_isempty(context->inaccessible_paths)) + return true; + + if (context->n_bind_mounts > 0) + return true; + + if (context->n_temporary_filesystems > 0) + return true; + + if (context->mount_flags != 0) + return true; + + if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir)) + return true; + + if (context->private_devices || + context->private_mounts || + context->protect_system != PROTECT_SYSTEM_NO || + context->protect_home != PROTECT_HOME_NO || + context->protect_kernel_tunables || + context->protect_kernel_modules || + context->protect_control_groups) + return true; + + if (context->root_directory) { + ExecDirectoryType t; + + if (context->mount_apivfs) + return true; + + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!params->prefix[t]) + continue; + + if (!strv_isempty(context->directories[t].paths)) + return true; + } + } + + if (context->dynamic_user && + (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths))) + return true; + + return false; +} + +static int setup_private_users(uid_t uid, gid_t gid) { + _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; + _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 }; + _cleanup_close_ int unshare_ready_fd = -1; + _cleanup_(sigkill_waitp) pid_t pid = 0; + uint64_t c = 1; + ssize_t n; + int r; + + /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to + * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which + * we however lack after opening the user namespace. To work around this we fork() a temporary child process, + * which waits for the parent to create the new user namespace while staying in the original namespace. The + * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and + * continues execution normally. */ + + if (uid != 0 && uid_is_valid(uid)) { + r = asprintf(&uid_map, + "0 0 1\n" /* Map root → root */ + UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */ + uid, uid); + if (r < 0) + return -ENOMEM; + } else { + uid_map = strdup("0 0 1\n"); /* The case where the above is the same */ + if (!uid_map) + return -ENOMEM; + } + + if (gid != 0 && gid_is_valid(gid)) { + r = asprintf(&gid_map, + "0 0 1\n" /* Map root → root */ + GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */ + gid, gid); + if (r < 0) + return -ENOMEM; + } else { + gid_map = strdup("0 0 1\n"); /* The case where the above is the same */ + if (!gid_map) + return -ENOMEM; + } + + /* Create a communication channel so that the parent can tell the child when it finished creating the user + * namespace. */ + unshare_ready_fd = eventfd(0, EFD_CLOEXEC); + if (unshare_ready_fd < 0) + return -errno; + + /* Create a communication channel so that the child can tell the parent a proper error code in case it + * failed. */ + if (pipe2(errno_pipe, O_CLOEXEC) < 0) + return -errno; + + r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid); + if (r < 0) + return r; + if (r == 0) { + _cleanup_close_ int fd = -1; + const char *a; + pid_t ppid; + + /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from + * here, after the parent opened its own user namespace. */ + + ppid = getppid(); + errno_pipe[0] = safe_close(errno_pipe[0]); + + /* Wait until the parent unshared the user namespace */ + if (read(unshare_ready_fd, &c, sizeof(c)) < 0) { + r = -errno; + goto child_fail; + } + + /* Disable the setgroups() system call in the child user namespace, for good. */ + a = procfs_file_alloca(ppid, "setgroups"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) { + r = -errno; + goto child_fail; + } + + /* If the file is missing the kernel is too old, let's continue anyway. */ + } else { + if (write(fd, "deny\n", 5) < 0) { + r = -errno; + goto child_fail; + } + + fd = safe_close(fd); + } + + /* First write the GID map */ + a = procfs_file_alloca(ppid, "gid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, gid_map, strlen(gid_map)) < 0) { + r = -errno; + goto child_fail; + } + fd = safe_close(fd); + + /* The write the UID map */ + a = procfs_file_alloca(ppid, "uid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, uid_map, strlen(uid_map)) < 0) { + r = -errno; + goto child_fail; + } + + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(errno_pipe[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + errno_pipe[1] = safe_close(errno_pipe[1]); + + if (unshare(CLONE_NEWUSER) < 0) + return -errno; + + /* Let the child know that the namespace is ready now */ + if (write(unshare_ready_fd, &c, sizeof(c)) < 0) + return -errno; + + /* Try to read an error code from the child */ + n = read(errno_pipe[0], &r, sizeof(r)); + if (n < 0) + return -errno; + if (n == sizeof(r)) { /* an error code was sent to us */ + if (r < 0) + return r; + return -EIO; + } + if (n != 0) /* on success we should have read 0 bytes */ + return -EIO; + + r = wait_for_terminate_and_check("(sd-userns)", pid, 0); + pid = 0; + if (r < 0) + return r; + if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */ + return -EIO; + + return 0; +} + +static int setup_exec_directory( + const ExecContext *context, + const ExecParameters *params, + uid_t uid, + gid_t gid, + ExecDirectoryType type, + int *exit_status) { + + static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY, + [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY, + [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY, + [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY, + [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY, + }; + char **rt; + int r; + + assert(context); + assert(params); + assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX); + assert(exit_status); + + if (!params->prefix[type]) + return 0; + + if (params->flags & EXEC_CHOWN_DIRECTORIES) { + if (!uid_is_valid(uid)) + uid = 0; + if (!gid_is_valid(gid)) + gid = 0; + } + + STRV_FOREACH(rt, context->directories[type].paths) { + _cleanup_free_ char *p = NULL, *pp = NULL; + + p = strjoin(params->prefix[type], "/", *rt); + if (!p) { + r = -ENOMEM; + goto fail; + } + + r = mkdir_parents_label(p, 0755); + if (r < 0) + goto fail; + + if (context->dynamic_user && + !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) { + _cleanup_free_ char *private_root = NULL; + + /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we + * want to avoid leaving a directory around fully accessible that is owned by a dynamic user + * whose UID is later on reused. To lock this down we use the same trick used by container + * managers to prohibit host users to get access to files of the same UID in containers: we + * place everything inside a directory that has an access mode of 0700 and is owned root:root, + * so that it acts as security boundary for unprivileged host code. We then use fs namespacing + * to make this directory permeable for the service itself. + * + * Specifically: for a service which wants a special directory "foo/" we first create a + * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of + * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way, + * privileged host users can access "foo/" as usual, but unprivileged host users can't look + * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally + * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus + * disabling the access boundary for the service and making sure it only gets access to the + * dirs it needs but no others. Tricky? Yes, absolutely, but it works! + * + * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be + * owned by the service itself. + * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing + * files or sockets with other services. */ + + private_root = strjoin(params->prefix[type], "/private"); + if (!private_root) { + r = -ENOMEM; + goto fail; + } + + /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */ + r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + goto fail; + + pp = strjoin(private_root, "/", *rt); + if (!pp) { + r = -ENOMEM; + goto fail; + } + + /* Create all directories between the configured directory and this private root, and mark them 0755 */ + r = mkdir_parents_label(pp, 0755); + if (r < 0) + goto fail; + + if (is_dir(p, false) > 0 && + (laccess(pp, F_OK) < 0 && errno == ENOENT)) { + + /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move + * it over. Most likely the service has been upgraded from one that didn't use + * DynamicUser=1, to one that does. */ + + if (rename(p, pp) < 0) { + r = -errno; + goto fail; + } + } else { + /* Otherwise, create the actual directory for the service */ + + r = mkdir_label(pp, context->directories[type].mode); + if (r < 0 && r != -EEXIST) + goto fail; + } + + /* And link it up from the original place */ + r = symlink_idempotent(pp, p, true); + if (r < 0) + goto fail; + + /* Lock down the access mode */ + if (chmod(pp, context->directories[type].mode) < 0) { + r = -errno; + goto fail; + } + } else { + r = mkdir_label(p, context->directories[type].mode); + if (r < 0 && r != -EEXIST) + goto fail; + if (r == -EEXIST) { + struct stat st; + + if (stat(p, &st) < 0) { + r = -errno; + goto fail; + } + if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0) + log_warning("%s \'%s\' already exists but the mode is different. " + "(filesystem: %o %sMode: %o)", + exec_directory_type_to_string(type), *rt, + st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777); + if (!context->dynamic_user) + continue; + } + } + + /* Don't change the owner of the configuration directory, as in the common case it is not written to by + * a service, and shall not be writable. */ + if (type == EXEC_DIRECTORY_CONFIGURATION) + continue; + + /* Then, change the ownership of the whole tree, if necessary */ + r = path_chown_recursive(pp ?: p, uid, gid); + if (r < 0) + goto fail; + } + + return 0; + +fail: + *exit_status = exit_status_table[type]; + return r; +} + +#if ENABLE_SMACK +static int setup_smack( + const ExecContext *context, + const ExecCommand *command) { + + int r; + + assert(context); + assert(command); + + if (context->smack_process_label) { + r = mac_smack_apply_pid(0, context->smack_process_label); + if (r < 0) + return r; + } +#ifdef SMACK_DEFAULT_PROCESS_LABEL + else { + _cleanup_free_ char *exec_label = NULL; + + r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label); + if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP)) + return r; + + r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL); + if (r < 0) + return r; + } +#endif + + return 0; +} +#endif + +static int compile_bind_mounts( + const ExecContext *context, + const ExecParameters *params, + BindMount **ret_bind_mounts, + size_t *ret_n_bind_mounts, + char ***ret_empty_directories) { + + _cleanup_strv_free_ char **empty_directories = NULL; + BindMount *bind_mounts; + size_t n, h = 0, i; + ExecDirectoryType t; + int r; + + assert(context); + assert(params); + assert(ret_bind_mounts); + assert(ret_n_bind_mounts); + assert(ret_empty_directories); + + n = context->n_bind_mounts; + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!params->prefix[t]) + continue; + + n += strv_length(context->directories[t].paths); + } + + if (n <= 0) { + *ret_bind_mounts = NULL; + *ret_n_bind_mounts = 0; + *ret_empty_directories = NULL; + return 0; + } + + bind_mounts = new(BindMount, n); + if (!bind_mounts) + return -ENOMEM; + + for (i = 0; i < context->n_bind_mounts; i++) { + BindMount *item = context->bind_mounts + i; + char *s, *d; + + s = strdup(item->source); + if (!s) { + r = -ENOMEM; + goto finish; + } + + d = strdup(item->destination); + if (!d) { + free(s); + r = -ENOMEM; + goto finish; + } + + bind_mounts[h++] = (BindMount) { + .source = s, + .destination = d, + .read_only = item->read_only, + .recursive = item->recursive, + .ignore_enoent = item->ignore_enoent, + }; + } + + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + char **suffix; + + if (!params->prefix[t]) + continue; + + if (strv_isempty(context->directories[t].paths)) + continue; + + if (context->dynamic_user && + !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) && + !(context->root_directory || context->root_image)) { + char *private_root; + + /* So this is for a dynamic user, and we need to make sure the process can access its own + * directory. For that we overmount the usually inaccessible "private" subdirectory with a + * tmpfs that makes it accessible and is empty except for the submounts we do this for. */ + + private_root = strjoin(params->prefix[t], "/private"); + if (!private_root) { + r = -ENOMEM; + goto finish; + } + + r = strv_consume(&empty_directories, private_root); + if (r < 0) + goto finish; + } + + STRV_FOREACH(suffix, context->directories[t].paths) { + char *s, *d; + + if (context->dynamic_user && + !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) + s = strjoin(params->prefix[t], "/private/", *suffix); + else + s = strjoin(params->prefix[t], "/", *suffix); + if (!s) { + r = -ENOMEM; + goto finish; + } + + if (context->dynamic_user && + !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) && + (context->root_directory || context->root_image)) + /* When RootDirectory= or RootImage= are set, then the symbolic link to the private + * directory is not created on the root directory. So, let's bind-mount the directory + * on the 'non-private' place. */ + d = strjoin(params->prefix[t], "/", *suffix); + else + d = strdup(s); + if (!d) { + free(s); + r = -ENOMEM; + goto finish; + } + + bind_mounts[h++] = (BindMount) { + .source = s, + .destination = d, + .read_only = false, + .recursive = true, + .ignore_enoent = false, + }; + } + } + + assert(h == n); + + *ret_bind_mounts = bind_mounts; + *ret_n_bind_mounts = n; + *ret_empty_directories = TAKE_PTR(empty_directories); + + return (int) n; + +finish: + bind_mount_free_many(bind_mounts, h); + return r; +} + +static int apply_mount_namespace( + const Unit *u, + const ExecCommand *command, + const ExecContext *context, + const ExecParameters *params, + const ExecRuntime *runtime) { + + _cleanup_strv_free_ char **empty_directories = NULL; + char *tmp = NULL, *var = NULL; + const char *root_dir = NULL, *root_image = NULL; + NamespaceInfo ns_info; + bool needs_sandboxing; + BindMount *bind_mounts = NULL; + size_t n_bind_mounts = 0; + int r; + + assert(context); + + /* The runtime struct only contains the parent of the private /tmp, + * which is non-accessible to world users. Inside of it there's a /tmp + * that is sticky, and that's the one we want to use here. */ + + if (context->private_tmp && runtime) { + if (runtime->tmp_dir) + tmp = strjoina(runtime->tmp_dir, "/tmp"); + if (runtime->var_tmp_dir) + var = strjoina(runtime->var_tmp_dir, "/tmp"); + } + + if (params->flags & EXEC_APPLY_CHROOT) { + root_image = context->root_image; + + if (!root_image) + root_dir = context->root_directory; + } + + r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories); + if (r < 0) + return r; + + needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED); + if (needs_sandboxing) + ns_info = (NamespaceInfo) { + .ignore_protect_paths = false, + .private_dev = context->private_devices, + .protect_control_groups = context->protect_control_groups, + .protect_kernel_tunables = context->protect_kernel_tunables, + .protect_kernel_modules = context->protect_kernel_modules, + .mount_apivfs = context->mount_apivfs, + .private_mounts = context->private_mounts, + }; + else if (!context->dynamic_user && root_dir) + /* + * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed + * sandbox info, otherwise enforce it, don't ignore protected paths and + * fail if we are enable to apply the sandbox inside the mount namespace. + */ + ns_info = (NamespaceInfo) { + .ignore_protect_paths = true, + }; + else + ns_info = (NamespaceInfo) {}; + + r = setup_namespace(root_dir, root_image, + &ns_info, context->read_write_paths, + needs_sandboxing ? context->read_only_paths : NULL, + needs_sandboxing ? context->inaccessible_paths : NULL, + empty_directories, + bind_mounts, + n_bind_mounts, + context->temporary_filesystems, + context->n_temporary_filesystems, + tmp, + var, + needs_sandboxing ? context->protect_home : PROTECT_HOME_NO, + needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO, + context->mount_flags, + DISSECT_IMAGE_DISCARD_ON_LOOP); + + bind_mount_free_many(bind_mounts, n_bind_mounts); + + /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports + * that with a special, recognizable error ENOANO. In this case, silently proceeed, but only if exclusively + * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a + * completely different execution environment. */ + if (r == -ENOANO) { + if (n_bind_mounts == 0 && + context->n_temporary_filesystems == 0 && + !root_dir && !root_image && + !context->dynamic_user) { + log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring."); + return 0; + } + + log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n" + "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s", + n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user)); + + return -EOPNOTSUPP; + } + + return r; +} + +static int apply_working_directory( + const ExecContext *context, + const ExecParameters *params, + const char *home, + const bool needs_mount_ns, + int *exit_status) { + + const char *d, *wd; + + assert(context); + assert(exit_status); + + if (context->working_directory_home) { + + if (!home) { + *exit_status = EXIT_CHDIR; + return -ENXIO; + } + + wd = home; + + } else if (context->working_directory) + wd = context->working_directory; + else + wd = "/"; + + if (params->flags & EXEC_APPLY_CHROOT) { + if (!needs_mount_ns && context->root_directory) + if (chroot(context->root_directory) < 0) { + *exit_status = EXIT_CHROOT; + return -errno; + } + + d = wd; + } else + d = prefix_roota(context->root_directory, wd); + + if (chdir(d) < 0 && !context->working_directory_missing_ok) { + *exit_status = EXIT_CHDIR; + return -errno; + } + + return 0; +} + +static int setup_keyring( + const Unit *u, + const ExecContext *context, + const ExecParameters *p, + uid_t uid, gid_t gid) { + + key_serial_t keyring; + int r = 0; + uid_t saved_uid; + gid_t saved_gid; + + assert(u); + assert(context); + assert(p); + + /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that + * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond + * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be + * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in + * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where + * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */ + + if (context->keyring_mode == EXEC_KEYRING_INHERIT) + return 0; + + /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up + * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel + * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user + * & group is just as nasty as acquiring a reference to the user keyring. */ + + saved_uid = getuid(); + saved_gid = getgid(); + + if (gid_is_valid(gid) && gid != saved_gid) { + if (setregid(gid, -1) < 0) + return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m"); + } + + if (uid_is_valid(uid) && uid != saved_uid) { + if (setreuid(uid, -1) < 0) { + r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m"); + goto out; + } + } + + keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0); + if (keyring == -1) { + if (errno == ENOSYS) + log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring."); + else if (IN_SET(errno, EACCES, EPERM)) + log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring."); + else if (errno == EDQUOT) + log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring."); + else + r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m"); + + goto out; + } + + /* When requested link the user keyring into the session keyring. */ + if (context->keyring_mode == EXEC_KEYRING_SHARED) { + + if (keyctl(KEYCTL_LINK, + KEY_SPEC_USER_KEYRING, + KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) { + r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m"); + goto out; + } + } + + /* Restore uid/gid back */ + if (uid_is_valid(uid) && uid != saved_uid) { + if (setreuid(saved_uid, -1) < 0) { + r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m"); + goto out; + } + } + + if (gid_is_valid(gid) && gid != saved_gid) { + if (setregid(saved_gid, -1) < 0) + return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m"); + } + + /* Populate they keyring with the invocation ID by default, as original saved_uid. */ + if (!sd_id128_is_null(u->invocation_id)) { + key_serial_t key; + + key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING); + if (key == -1) + log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m"); + else { + if (keyctl(KEYCTL_SETPERM, key, + KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH| + KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0) + r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m"); + } + } + +out: + /* Revert back uid & gid for the the last time, and exit */ + /* no extra logging, as only the first already reported error matters */ + if (getuid() != saved_uid) + (void) setreuid(saved_uid, -1); + + if (getgid() != saved_gid) + (void) setregid(saved_gid, -1); + + return r; +} + +static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) { + assert(array); + assert(n); + + if (!pair) + return; + + if (pair[0] >= 0) + array[(*n)++] = pair[0]; + if (pair[1] >= 0) + array[(*n)++] = pair[1]; +} + +static int close_remaining_fds( + const ExecParameters *params, + const ExecRuntime *runtime, + const DynamicCreds *dcreds, + int user_lookup_fd, + int socket_fd, + int exec_fd, + int *fds, size_t n_fds) { + + size_t n_dont_close = 0; + int dont_close[n_fds + 12]; + + assert(params); + + if (params->stdin_fd >= 0) + dont_close[n_dont_close++] = params->stdin_fd; + if (params->stdout_fd >= 0) + dont_close[n_dont_close++] = params->stdout_fd; + if (params->stderr_fd >= 0) + dont_close[n_dont_close++] = params->stderr_fd; + + if (socket_fd >= 0) + dont_close[n_dont_close++] = socket_fd; + if (exec_fd >= 0) + dont_close[n_dont_close++] = exec_fd; + if (n_fds > 0) { + memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds); + n_dont_close += n_fds; + } + + if (runtime) + append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket); + + if (dcreds) { + if (dcreds->user) + append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket); + if (dcreds->group) + append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket); + } + + if (user_lookup_fd >= 0) + dont_close[n_dont_close++] = user_lookup_fd; + + return close_all_fds(dont_close, n_dont_close); +} + +static int send_user_lookup( + Unit *unit, + int user_lookup_fd, + uid_t uid, + gid_t gid) { + + assert(unit); + + /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID + * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was + * specified. */ + + if (user_lookup_fd < 0) + return 0; + + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return 0; + + if (writev(user_lookup_fd, + (struct iovec[]) { + IOVEC_INIT(&uid, sizeof(uid)), + IOVEC_INIT(&gid, sizeof(gid)), + IOVEC_INIT_STRING(unit->id) }, 3) < 0) + return -errno; + + return 0; +} + +static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) { + int r; + + assert(c); + assert(home); + assert(buf); + + /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */ + + if (*home) + return 0; + + if (!c->working_directory_home) + return 0; + + if (uid == 0) { + /* Hardcode /root as home directory for UID 0 */ + *home = "/root"; + return 1; + } + + r = get_home_dir(buf); + if (r < 0) + return r; + + *home = *buf; + return 1; +} + +static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) { + _cleanup_strv_free_ char ** list = NULL; + ExecDirectoryType t; + int r; + + assert(c); + assert(p); + assert(ret); + + assert(c->dynamic_user); + + /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for + * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special + * directories. */ + + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + char **i; + + if (t == EXEC_DIRECTORY_CONFIGURATION) + continue; + + if (!p->prefix[t]) + continue; + + STRV_FOREACH(i, c->directories[t].paths) { + char *e; + + if (t == EXEC_DIRECTORY_RUNTIME) + e = strjoin(p->prefix[t], "/", *i); + else + e = strjoin(p->prefix[t], "/private/", *i); + if (!e) + return -ENOMEM; + + r = strv_consume(&list, e); + if (r < 0) + return r; + } + } + + *ret = TAKE_PTR(list); + + return 0; +} + +static char *exec_command_line(char **argv); + +static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) { + bool using_subcgroup; + char *p; + + assert(params); + assert(ret); + + if (!params->cgroup_path) + return -EINVAL; + + /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated + * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control + * processes started after the main unit's process in the unit's main cgroup because it is now an inner one, + * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process, + * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=, + * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre= + * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP + * flag, which is only passed for the former statements, not for the latter. */ + + using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL); + if (using_subcgroup) + p = strjoin(params->cgroup_path, "/.control"); + else + p = strdup(params->cgroup_path); + if (!p) + return -ENOMEM; + + *ret = p; + return using_subcgroup; +} + +static int exec_child( + Unit *unit, + const ExecCommand *command, + const ExecContext *context, + const ExecParameters *params, + ExecRuntime *runtime, + DynamicCreds *dcreds, + int socket_fd, + int named_iofds[3], + int *fds, + size_t n_socket_fds, + size_t n_storage_fds, + char **files_env, + int user_lookup_fd, + int *exit_status) { + + _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL; + int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1; + _cleanup_free_ gid_t *supplementary_gids = NULL; + const char *username = NULL, *groupname = NULL; + _cleanup_free_ char *home_buffer = NULL; + const char *home = NULL, *shell = NULL; + dev_t journal_stream_dev = 0; + ino_t journal_stream_ino = 0; + bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */ + needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */ + needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */ + needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */ +#if HAVE_SELINUX + _cleanup_free_ char *mac_selinux_context_net = NULL; + bool use_selinux = false; +#endif +#if ENABLE_SMACK + bool use_smack = false; +#endif +#if HAVE_APPARMOR + bool use_apparmor = false; +#endif + uid_t uid = UID_INVALID; + gid_t gid = GID_INVALID; + size_t n_fds; + ExecDirectoryType dt; + int secure_bits; + + assert(unit); + assert(command); + assert(context); + assert(params); + assert(exit_status); + + rename_process_from_path(command->path); + + /* We reset exactly these signals, since they are the + * only ones we set to SIG_IGN in the main daemon. All + * others we leave untouched because we set them to + * SIG_DFL or a valid handler initially, both of which + * will be demoted to SIG_DFL. */ + (void) default_signals(SIGNALS_CRASH_HANDLER, + SIGNALS_IGNORE, -1); + + if (context->ignore_sigpipe) + (void) ignore_signals(SIGPIPE, -1); + + r = reset_signal_mask(); + if (r < 0) { + *exit_status = EXIT_SIGNAL_MASK; + return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m"); + } + + if (params->idle_pipe) + do_idle_pipe_dance(params->idle_pipe); + + /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its + * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have + * any fds open we don't really want open during the transition. In order to make logging work, we switch the + * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */ + + log_forget_fds(); + log_set_open_when_needed(true); + + /* In case anything used libc syslog(), close this here, too */ + closelog(); + + n_fds = n_socket_fds + n_storage_fds; + r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m"); + } + + if (!context->same_pgrp) + if (setsid() < 0) { + *exit_status = EXIT_SETSID; + return log_unit_error_errno(unit, errno, "Failed to create new process session: %m"); + } + + exec_context_tty_reset(context, params); + + if (unit_shall_confirm_spawn(unit)) { + const char *vc = params->confirm_spawn; + _cleanup_free_ char *cmdline = NULL; + + cmdline = exec_command_line(command->argv); + if (!cmdline) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = ask_for_confirmation(vc, unit, cmdline); + if (r != CONFIRM_EXECUTE) { + if (r == CONFIRM_PRETEND_SUCCESS) { + *exit_status = EXIT_SUCCESS; + return 0; + } + *exit_status = EXIT_CONFIRM; + log_unit_error(unit, "Execution cancelled by the user"); + return -ECANCELED; + } + } + + /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is + * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note + * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS + * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they + * might internally call into other NSS modules that are involved in hostname resolution, we never know. */ + if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 || + setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) { + *exit_status = EXIT_MEMORY; + return log_unit_error_errno(unit, errno, "Failed to update environment: %m"); + } + + if (context->dynamic_user && dcreds) { + _cleanup_strv_free_ char **suggested_paths = NULL; + + /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS + * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/ + if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, errno, "Failed to update environment: %m"); + } + + r = compile_suggested_paths(context, params, &suggested_paths); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid); + if (r < 0) { + *exit_status = EXIT_USER; + if (r == -EILSEQ) { + log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists."); + return -EOPNOTSUPP; + } + return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m"); + } + + if (!uid_is_valid(uid)) { + *exit_status = EXIT_USER; + log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid); + return -ESRCH; + } + + if (!gid_is_valid(gid)) { + *exit_status = EXIT_USER; + log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid); + return -ESRCH; + } + + if (dcreds->user) + username = dcreds->user->name; + + } else { + r = get_fixed_user(context, &username, &uid, &gid, &home, &shell); + if (r < 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m"); + } + + r = get_fixed_group(context, &groupname, &gid); + if (r < 0) { + *exit_status = EXIT_GROUP; + return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m"); + } + } + + /* Initialize user supplementary groups and get SupplementaryGroups= ones */ + r = get_supplementary_groups(context, username, groupname, gid, + &supplementary_gids, &ngids); + if (r < 0) { + *exit_status = EXIT_GROUP; + return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m"); + } + + r = send_user_lookup(unit, user_lookup_fd, uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m"); + } + + user_lookup_fd = safe_close(user_lookup_fd); + + r = acquire_home(context, uid, &home, &home_buffer); + if (r < 0) { + *exit_status = EXIT_CHDIR; + return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m"); + } + + /* If a socket is connected to STDIN/STDOUT/STDERR, we + * must sure to drop O_NONBLOCK */ + if (socket_fd >= 0) + (void) fd_nonblock(socket_fd, false); + + /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields. + * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */ + if (params->cgroup_path) { + _cleanup_free_ char *p = NULL; + + r = exec_parameters_get_cgroup_path(params, &p); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m"); + } + + r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p); + } + } + + r = setup_input(context, params, socket_fd, named_iofds); + if (r < 0) { + *exit_status = EXIT_STDIN; + return log_unit_error_errno(unit, r, "Failed to set up standard input: %m"); + } + + r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + if (r < 0) { + *exit_status = EXIT_STDOUT; + return log_unit_error_errno(unit, r, "Failed to set up standard output: %m"); + } + + r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + if (r < 0) { + *exit_status = EXIT_STDERR; + return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m"); + } + + if (context->oom_score_adjust_set) { + /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces + * prohibit write access to this file, and we shouldn't trip up over that. */ + r = set_oom_score_adjust(context->oom_score_adjust); + if (IN_SET(r, -EPERM, -EACCES)) + log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m"); + else if (r < 0) { + *exit_status = EXIT_OOM_ADJUST; + return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m"); + } + } + + if (context->nice_set) + if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) { + *exit_status = EXIT_NICE; + return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m"); + } + + if (context->cpu_sched_set) { + struct sched_param param = { + .sched_priority = context->cpu_sched_priority, + }; + + r = sched_setscheduler(0, + context->cpu_sched_policy | + (context->cpu_sched_reset_on_fork ? + SCHED_RESET_ON_FORK : 0), + ¶m); + if (r < 0) { + *exit_status = EXIT_SETSCHEDULER; + return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m"); + } + } + + if (context->cpuset) + if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) { + *exit_status = EXIT_CPUAFFINITY; + return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m"); + } + + if (context->ioprio_set) + if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) { + *exit_status = EXIT_IOPRIO; + return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m"); + } + + if (context->timer_slack_nsec != NSEC_INFINITY) + if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) { + *exit_status = EXIT_TIMERSLACK; + return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m"); + } + + if (context->personality != PERSONALITY_INVALID) { + r = safe_personality(context->personality); + if (r < 0) { + *exit_status = EXIT_PERSONALITY; + return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m"); + } + } + + if (context->utmp_id) + utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0), + context->tty_path, + context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS : + context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS : + USER_PROCESS, + username); + + if (context->user) { + r = chown_terminal(STDIN_FILENO, uid); + if (r < 0) { + *exit_status = EXIT_STDIN; + return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m"); + } + } + + /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1 + * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not + * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only + * touch a single hierarchy too. */ + if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) { + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m"); + } + } + + for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + r = setup_exec_directory(context, params, uid, gid, dt, exit_status); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]); + } + + r = build_environment( + unit, + context, + params, + n_fds, + home, + username, + shell, + journal_stream_dev, + journal_stream_ino, + &our_env); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = build_pass_environment(context, &pass_env); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + accum_env = strv_env_merge(5, + params->environment, + our_env, + pass_env, + context->environment, + files_env, + NULL); + if (!accum_env) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + accum_env = strv_env_clean(accum_env); + + (void) umask(context->umask); + + r = setup_keyring(unit, context, params, uid, gid); + if (r < 0) { + *exit_status = EXIT_KEYRING; + return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m"); + } + + /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */ + needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED); + + /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */ + needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported(); + + /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */ + if (needs_ambient_hack) + needs_setuid = false; + else + needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID)); + + if (needs_sandboxing) { + /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being + * present. The actual MAC context application will happen later, as late as possible, to avoid + * impacting our own code paths. */ + +#if HAVE_SELINUX + use_selinux = mac_selinux_use(); +#endif +#if ENABLE_SMACK + use_smack = mac_smack_use(); +#endif +#if HAVE_APPARMOR + use_apparmor = mac_apparmor_use(); +#endif + } + + if (needs_sandboxing) { + int which_failed; + + /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what + * is set here. (See below.) */ + + r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed); + if (r < 0) { + *exit_status = EXIT_LIMITS; + return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed)); + } + } + + if (needs_setuid) { + + /* Let's call into PAM after we set up our own idea of resource limits to that pam_limits + * wins here. (See above.) */ + + if (context->pam_name && username) { + r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds); + if (r < 0) { + *exit_status = EXIT_PAM; + return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m"); + } + } + } + + if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) { + if (ns_type_supported(NAMESPACE_NET)) { + r = setup_netns(runtime->netns_storage_socket); + if (r < 0) { + *exit_status = EXIT_NETWORK; + return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m"); + } + } else + log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring."); + } + + needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime); + if (needs_mount_namespace) { + r = apply_mount_namespace(unit, command, context, params, runtime); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m"); + } + } + + /* Drop groups as early as possbile */ + if (needs_setuid) { + r = enforce_groups(gid, supplementary_gids, ngids); + if (r < 0) { + *exit_status = EXIT_GROUP; + return log_unit_error_errno(unit, r, "Changing group credentials failed: %m"); + } + } + + if (needs_sandboxing) { +#if HAVE_SELINUX + if (use_selinux && params->selinux_context_net && socket_fd >= 0) { + r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net); + if (r < 0) { + *exit_status = EXIT_SELINUX_CONTEXT; + return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m"); + } + } +#endif + + if (context->private_users) { + r = setup_private_users(uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m"); + } + } + } + + /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are + * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd + * however if we have it as we want to keep it open until the final execve(). */ + + if (params->exec_fd >= 0) { + exec_fd = params->exec_fd; + + if (exec_fd < 3 + (int) n_fds) { + int moved_fd; + + /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the + * process we are about to execute. */ + + moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds); + if (moved_fd < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m"); + } + + safe_close(exec_fd); + exec_fd = moved_fd; + } else { + /* This fd should be FD_CLOEXEC already, but let's make sure. */ + r = fd_cloexec(exec_fd, true); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m"); + } + } + + fds_with_exec_fd = newa(int, n_fds + 1); + memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int)); + fds_with_exec_fd[n_fds] = exec_fd; + n_fds_with_exec_fd = n_fds + 1; + } else { + fds_with_exec_fd = fds; + n_fds_with_exec_fd = n_fds; + } + + r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd); + if (r >= 0) + r = shift_fds(fds, n_fds); + if (r >= 0) + r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m"); + } + + /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off + * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined, + * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we + * came this far. */ + + secure_bits = context->secure_bits; + + if (needs_sandboxing) { + uint64_t bset; + + /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly + * requested. (Note this is placed after the general resource limit initialization, see + * above, in order to take precedence.) */ + if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) { + if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) { + *exit_status = EXIT_LIMITS; + return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m"); + } + } + +#if ENABLE_SMACK + /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the + * process. This is the latest place before dropping capabilities. Other MAC context are set later. */ + if (use_smack) { + r = setup_smack(context, command); + if (r < 0) { + *exit_status = EXIT_SMACK_PROCESS_LABEL; + return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m"); + } + } +#endif + + bset = context->capability_bounding_set; + /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for + * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own, + * instead of us doing that */ + if (needs_ambient_hack) + bset |= (UINT64_C(1) << CAP_SETPCAP) | + (UINT64_C(1) << CAP_SETUID) | + (UINT64_C(1) << CAP_SETGID); + + if (!cap_test_all(bset)) { + r = capability_bounding_set_drop(bset, false); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m"); + } + } + + /* This is done before enforce_user, but ambient set + * does not survive over setresuid() if keep_caps is not set. */ + if (!needs_ambient_hack && + context->capability_ambient_set != 0) { + r = capability_ambient_set_apply(context->capability_ambient_set, true); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m"); + } + } + } + + if (needs_setuid) { + if (context->user) { + r = enforce_user(context, uid); + if (r < 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid); + } + + if (!needs_ambient_hack && + context->capability_ambient_set != 0) { + + /* Fix the ambient capabilities after user change. */ + r = capability_ambient_set_apply(context->capability_ambient_set, false); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m"); + } + + /* If we were asked to change user and ambient capabilities + * were requested, we had to add keep-caps to the securebits + * so that we would maintain the inherited capability set + * through the setresuid(). Make sure that the bit is added + * also to the context secure_bits so that we don't try to + * drop the bit away next. */ + + secure_bits |= 1<<SECURE_KEEP_CAPS; + } + } + } + + /* Apply working directory here, because the working directory might be on NFS and only the user running + * this service might have the correct privilege to change to the working directory */ + r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status); + if (r < 0) + return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m"); + + if (needs_sandboxing) { + /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to + * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires + * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls + * are restricted. */ + +#if HAVE_SELINUX + if (use_selinux) { + char *exec_context = mac_selinux_context_net ?: context->selinux_context; + + if (exec_context) { + r = setexeccon(exec_context); + if (r < 0) { + *exit_status = EXIT_SELINUX_CONTEXT; + return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context); + } + } + } +#endif + +#if HAVE_APPARMOR + if (use_apparmor && context->apparmor_profile) { + r = aa_change_onexec(context->apparmor_profile); + if (r < 0 && !context->apparmor_profile_ignore) { + *exit_status = EXIT_APPARMOR_PROFILE; + return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile); + } + } +#endif + + /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs + * we'll try not to call PR_SET_SECUREBITS unless necessary. */ + if (prctl(PR_GET_SECUREBITS) != secure_bits) + if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) { + *exit_status = EXIT_SECUREBITS; + return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m"); + } + + if (context_has_no_new_privileges(context)) + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { + *exit_status = EXIT_NO_NEW_PRIVILEGES; + return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m"); + } + +#if HAVE_SECCOMP + r = apply_address_families(unit, context); + if (r < 0) { + *exit_status = EXIT_ADDRESS_FAMILIES; + return log_unit_error_errno(unit, r, "Failed to restrict address families: %m"); + } + + r = apply_memory_deny_write_execute(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m"); + } + + r = apply_restrict_realtime(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m"); + } + + r = apply_restrict_namespaces(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m"); + } + + r = apply_protect_sysctl(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m"); + } + + r = apply_protect_kernel_modules(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m"); + } + + r = apply_private_devices(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to set up private devices: %m"); + } + + r = apply_syscall_archs(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m"); + } + + r = apply_lock_personality(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to lock personalities: %m"); + } + + /* This really should remain the last step before the execve(), to make sure our own code is unaffected + * by the filter as little as possible. */ + r = apply_syscall_filter(unit, context, needs_ambient_hack); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m"); + } +#endif + } + + if (!strv_isempty(context->unset_environment)) { + char **ee = NULL; + + ee = strv_env_delete(accum_env, 1, context->unset_environment); + if (!ee) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + strv_free_and_replace(accum_env, ee); + } + + final_argv = replace_env_argv(command->argv, accum_env); + if (!final_argv) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *line; + + line = exec_command_line(final_argv); + if (line) + log_struct(LOG_DEBUG, + "EXECUTABLE=%s", command->path, + LOG_UNIT_MESSAGE(unit, "Executing: %s", line), + LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit)); + } + + if (exec_fd >= 0) { + uint8_t hot = 1; + + /* We have finished with all our initializations. Let's now let the manager know that. From this point + * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m"); + } + } + + execve(command->path, final_argv, accum_env); + r = -errno; + + if (exec_fd >= 0) { + uint8_t hot = 0; + + /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager + * that POLLHUP on it no longer means execve() succeeded. */ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m"); + } + } + + if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { + log_struct_errno(LOG_INFO, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), + LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m", + command->path), + "EXECUTABLE=%s", command->path); + return 0; + } + + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, r, "Failed to execute command: %m"); +} + +static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); +static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]); + +int exec_spawn(Unit *unit, + ExecCommand *command, + const ExecContext *context, + const ExecParameters *params, + ExecRuntime *runtime, + DynamicCreds *dcreds, + pid_t *ret) { + + int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL; + _cleanup_free_ char *subcgroup_path = NULL; + _cleanup_strv_free_ char **files_env = NULL; + size_t n_storage_fds = 0, n_socket_fds = 0; + _cleanup_free_ char *line = NULL; + pid_t pid; + + assert(unit); + assert(command); + assert(context); + assert(ret); + assert(params); + assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0)); + + if (context->std_input == EXEC_INPUT_SOCKET || + context->std_output == EXEC_OUTPUT_SOCKET || + context->std_error == EXEC_OUTPUT_SOCKET) { + + if (params->n_socket_fds > 1) { + log_unit_error(unit, "Got more than one socket."); + return -EINVAL; + } + + if (params->n_socket_fds == 0) { + log_unit_error(unit, "Got no socket."); + return -EINVAL; + } + + socket_fd = params->fds[0]; + } else { + socket_fd = -1; + fds = params->fds; + n_socket_fds = params->n_socket_fds; + n_storage_fds = params->n_storage_fds; + } + + r = exec_context_named_iofds(context, params, named_iofds); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m"); + + r = exec_context_load_environment(unit, context, &files_env); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to load environment files: %m"); + + line = exec_command_line(command->argv); + if (!line) + return log_oom(); + + log_struct(LOG_DEBUG, + LOG_UNIT_MESSAGE(unit, "About to execute: %s", line), + "EXECUTABLE=%s", command->path, + LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit)); + + if (params->cgroup_path) { + r = exec_parameters_get_cgroup_path(params, &subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m"); + if (r > 0) { /* We are using a child cgroup */ + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path); + } + } + + pid = fork(); + if (pid < 0) + return log_unit_error_errno(unit, errno, "Failed to fork: %m"); + + if (pid == 0) { + int exit_status = EXIT_SUCCESS; + + r = exec_child(unit, + command, + context, + params, + runtime, + dcreds, + socket_fd, + named_iofds, + fds, + n_socket_fds, + n_storage_fds, + files_env, + unit->manager->user_lookup_fds[1], + &exit_status); + + if (r < 0) + log_struct_errno(LOG_ERR, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), + LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m", + exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD), + command->path), + "EXECUTABLE=%s", command->path); + + _exit(exit_status); + } + + log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid); + + /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever + * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the + * process will be killed too). */ + if (subcgroup_path) + (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid); + + exec_status_start(&command->exec_status, pid); + + *ret = pid; + return 0; +} + +void exec_context_init(ExecContext *c) { + ExecDirectoryType i; + + assert(c); + + c->umask = 0022; + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0); + c->cpu_sched_policy = SCHED_OTHER; + c->syslog_priority = LOG_DAEMON|LOG_INFO; + c->syslog_level_prefix = true; + c->ignore_sigpipe = true; + c->timer_slack_nsec = NSEC_INFINITY; + c->personality = PERSONALITY_INVALID; + for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) + c->directories[i].mode = 0755; + c->capability_bounding_set = CAP_ALL; + assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL); + c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL; + c->log_level_max = -1; +} + +void exec_context_done(ExecContext *c) { + ExecDirectoryType i; + size_t l; + + assert(c); + + c->environment = strv_free(c->environment); + c->environment_files = strv_free(c->environment_files); + c->pass_environment = strv_free(c->pass_environment); + c->unset_environment = strv_free(c->unset_environment); + + rlimit_free_all(c->rlimit); + + for (l = 0; l < 3; l++) { + c->stdio_fdname[l] = mfree(c->stdio_fdname[l]); + c->stdio_file[l] = mfree(c->stdio_file[l]); + } + + c->working_directory = mfree(c->working_directory); + c->root_directory = mfree(c->root_directory); + c->root_image = mfree(c->root_image); + c->tty_path = mfree(c->tty_path); + c->syslog_identifier = mfree(c->syslog_identifier); + c->user = mfree(c->user); + c->group = mfree(c->group); + + c->supplementary_groups = strv_free(c->supplementary_groups); + + c->pam_name = mfree(c->pam_name); + + c->read_only_paths = strv_free(c->read_only_paths); + c->read_write_paths = strv_free(c->read_write_paths); + c->inaccessible_paths = strv_free(c->inaccessible_paths); + + bind_mount_free_many(c->bind_mounts, c->n_bind_mounts); + c->bind_mounts = NULL; + c->n_bind_mounts = 0; + temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems); + c->temporary_filesystems = NULL; + c->n_temporary_filesystems = 0; + + c->cpuset = cpu_set_mfree(c->cpuset); + + c->utmp_id = mfree(c->utmp_id); + c->selinux_context = mfree(c->selinux_context); + c->apparmor_profile = mfree(c->apparmor_profile); + c->smack_process_label = mfree(c->smack_process_label); + + c->syscall_filter = hashmap_free(c->syscall_filter); + c->syscall_archs = set_free(c->syscall_archs); + c->address_families = set_free(c->address_families); + + for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) + c->directories[i].paths = strv_free(c->directories[i].paths); + + c->log_level_max = -1; + + exec_context_free_log_extra_fields(c); + + c->log_rate_limit_interval_usec = 0; + c->log_rate_limit_burst = 0; + + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; +} + +int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) { + char **i; + + assert(c); + + if (!runtime_prefix) + return 0; + + STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) { + _cleanup_free_ char *p; + + p = strjoin(runtime_prefix, "/", *i); + if (!p) + return -ENOMEM; + + /* We execute this synchronously, since we need to be sure this is gone when we start the service + * next. */ + (void) rm_rf(p, REMOVE_ROOT); + } + + return 0; +} + +static void exec_command_done(ExecCommand *c) { + assert(c); + + c->path = mfree(c->path); + c->argv = strv_free(c->argv); +} + +void exec_command_done_array(ExecCommand *c, size_t n) { + size_t i; + + for (i = 0; i < n; i++) + exec_command_done(c+i); +} + +ExecCommand* exec_command_free_list(ExecCommand *c) { + ExecCommand *i; + + while ((i = c)) { + LIST_REMOVE(command, c, i); + exec_command_done(i); + free(i); + } + + return NULL; +} + +void exec_command_free_array(ExecCommand **c, size_t n) { + size_t i; + + for (i = 0; i < n; i++) + c[i] = exec_command_free_list(c[i]); +} + +void exec_command_reset_status_array(ExecCommand *c, size_t n) { + size_t i; + + for (i = 0; i < n; i++) + exec_status_reset(&c[i].exec_status); +} + +void exec_command_reset_status_list_array(ExecCommand **c, size_t n) { + size_t i; + + for (i = 0; i < n; i++) { + ExecCommand *z; + + LIST_FOREACH(command, z, c[i]) + exec_status_reset(&z->exec_status); + } +} + +typedef struct InvalidEnvInfo { + const Unit *unit; + const char *path; +} InvalidEnvInfo; + +static void invalid_env(const char *p, void *userdata) { + InvalidEnvInfo *info = userdata; + + log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path); +} + +const char* exec_context_fdname(const ExecContext *c, int fd_index) { + assert(c); + + switch (fd_index) { + + case STDIN_FILENO: + if (c->std_input != EXEC_INPUT_NAMED_FD) + return NULL; + + return c->stdio_fdname[STDIN_FILENO] ?: "stdin"; + + case STDOUT_FILENO: + if (c->std_output != EXEC_OUTPUT_NAMED_FD) + return NULL; + + return c->stdio_fdname[STDOUT_FILENO] ?: "stdout"; + + case STDERR_FILENO: + if (c->std_error != EXEC_OUTPUT_NAMED_FD) + return NULL; + + return c->stdio_fdname[STDERR_FILENO] ?: "stderr"; + + default: + return NULL; + } +} + +static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]) { + size_t i, targets; + const char* stdio_fdname[3]; + size_t n_fds; + + assert(c); + assert(p); + + targets = (c->std_input == EXEC_INPUT_NAMED_FD) + + (c->std_output == EXEC_OUTPUT_NAMED_FD) + + (c->std_error == EXEC_OUTPUT_NAMED_FD); + + for (i = 0; i < 3; i++) + stdio_fdname[i] = exec_context_fdname(c, i); + + n_fds = p->n_storage_fds + p->n_socket_fds; + + for (i = 0; i < n_fds && targets > 0; i++) + if (named_iofds[STDIN_FILENO] < 0 && + c->std_input == EXEC_INPUT_NAMED_FD && + stdio_fdname[STDIN_FILENO] && + streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) { + + named_iofds[STDIN_FILENO] = p->fds[i]; + targets--; + + } else if (named_iofds[STDOUT_FILENO] < 0 && + c->std_output == EXEC_OUTPUT_NAMED_FD && + stdio_fdname[STDOUT_FILENO] && + streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) { + + named_iofds[STDOUT_FILENO] = p->fds[i]; + targets--; + + } else if (named_iofds[STDERR_FILENO] < 0 && + c->std_error == EXEC_OUTPUT_NAMED_FD && + stdio_fdname[STDERR_FILENO] && + streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) { + + named_iofds[STDERR_FILENO] = p->fds[i]; + targets--; + } + + return targets == 0 ? 0 : -ENOENT; +} + +static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) { + char **i, **r = NULL; + + assert(c); + assert(l); + + STRV_FOREACH(i, c->environment_files) { + char *fn; + int k; + unsigned n; + bool ignore = false; + char **p; + _cleanup_globfree_ glob_t pglob = {}; + + fn = *i; + + if (fn[0] == '-') { + ignore = true; + fn++; + } + + if (!path_is_absolute(fn)) { + if (ignore) + continue; + + strv_free(r); + return -EINVAL; + } + + /* Filename supports globbing, take all matching files */ + k = safe_glob(fn, 0, &pglob); + if (k < 0) { + if (ignore) + continue; + + strv_free(r); + return k; + } + + /* When we don't match anything, -ENOENT should be returned */ + assert(pglob.gl_pathc > 0); + + for (n = 0; n < pglob.gl_pathc; n++) { + k = load_env_file(NULL, pglob.gl_pathv[n], &p); + if (k < 0) { + if (ignore) + continue; + + strv_free(r); + return k; + } + /* Log invalid environment variables with filename */ + if (p) { + InvalidEnvInfo info = { + .unit = unit, + .path = pglob.gl_pathv[n] + }; + + p = strv_env_clean_with_callback(p, invalid_env, &info); + } + + if (!r) + r = p; + else { + char **m; + + m = strv_env_merge(2, r, p); + strv_free(r); + strv_free(p); + if (!m) + return -ENOMEM; + + r = m; + } + } + } + + *l = r; + + return 0; +} + +static bool tty_may_match_dev_console(const char *tty) { + _cleanup_free_ char *resolved = NULL; + + if (!tty) + return true; + + tty = skip_dev_prefix(tty); + + /* trivial identity? */ + if (streq(tty, "console")) + return true; + + if (resolve_dev_console(&resolved) < 0) + return true; /* if we could not resolve, assume it may */ + + /* "tty0" means the active VC, so it may be the same sometimes */ + return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty)); +} + +bool exec_context_may_touch_console(const ExecContext *ec) { + + return (ec->tty_reset || + ec->tty_vhangup || + ec->tty_vt_disallocate || + is_terminal_input(ec->std_input) || + is_terminal_output(ec->std_output) || + is_terminal_output(ec->std_error)) && + tty_may_match_dev_console(exec_context_tty_path(ec)); +} + +static void strv_fprintf(FILE *f, char **l) { + char **g; + + assert(f); + + STRV_FOREACH(g, l) + fprintf(f, " %s", *g); +} + +void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { + ExecDirectoryType dt; + char **e, **d; + unsigned i; + int r; + + assert(c); + assert(f); + + prefix = strempty(prefix); + + fprintf(f, + "%sUMask: %04o\n" + "%sWorkingDirectory: %s\n" + "%sRootDirectory: %s\n" + "%sNonBlocking: %s\n" + "%sPrivateTmp: %s\n" + "%sPrivateDevices: %s\n" + "%sProtectKernelTunables: %s\n" + "%sProtectKernelModules: %s\n" + "%sProtectControlGroups: %s\n" + "%sPrivateNetwork: %s\n" + "%sPrivateUsers: %s\n" + "%sProtectHome: %s\n" + "%sProtectSystem: %s\n" + "%sMountAPIVFS: %s\n" + "%sIgnoreSIGPIPE: %s\n" + "%sMemoryDenyWriteExecute: %s\n" + "%sRestrictRealtime: %s\n" + "%sKeyringMode: %s\n", + prefix, c->umask, + prefix, c->working_directory ? c->working_directory : "/", + prefix, c->root_directory ? c->root_directory : "/", + prefix, yes_no(c->non_blocking), + prefix, yes_no(c->private_tmp), + prefix, yes_no(c->private_devices), + prefix, yes_no(c->protect_kernel_tunables), + prefix, yes_no(c->protect_kernel_modules), + prefix, yes_no(c->protect_control_groups), + prefix, yes_no(c->private_network), + prefix, yes_no(c->private_users), + prefix, protect_home_to_string(c->protect_home), + prefix, protect_system_to_string(c->protect_system), + prefix, yes_no(c->mount_apivfs), + prefix, yes_no(c->ignore_sigpipe), + prefix, yes_no(c->memory_deny_write_execute), + prefix, yes_no(c->restrict_realtime), + prefix, exec_keyring_mode_to_string(c->keyring_mode)); + + if (c->root_image) + fprintf(f, "%sRootImage: %s\n", prefix, c->root_image); + + STRV_FOREACH(e, c->environment) + fprintf(f, "%sEnvironment: %s\n", prefix, *e); + + STRV_FOREACH(e, c->environment_files) + fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e); + + STRV_FOREACH(e, c->pass_environment) + fprintf(f, "%sPassEnvironment: %s\n", prefix, *e); + + STRV_FOREACH(e, c->unset_environment) + fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e); + + fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode)); + + for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode); + + STRV_FOREACH(d, c->directories[dt].paths) + fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d); + } + + if (c->nice_set) + fprintf(f, + "%sNice: %i\n", + prefix, c->nice); + + if (c->oom_score_adjust_set) + fprintf(f, + "%sOOMScoreAdjust: %i\n", + prefix, c->oom_score_adjust); + + for (i = 0; i < RLIM_NLIMITS; i++) + if (c->rlimit[i]) { + fprintf(f, "%sLimit%s: " RLIM_FMT "\n", + prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max); + fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n", + prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur); + } + + if (c->ioprio_set) { + _cleanup_free_ char *class_str = NULL; + + r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str); + if (r >= 0) + fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str); + + fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio)); + } + + if (c->cpu_sched_set) { + _cleanup_free_ char *policy_str = NULL; + + r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str); + if (r >= 0) + fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str); + + fprintf(f, + "%sCPUSchedulingPriority: %i\n" + "%sCPUSchedulingResetOnFork: %s\n", + prefix, c->cpu_sched_priority, + prefix, yes_no(c->cpu_sched_reset_on_fork)); + } + + if (c->cpuset) { + fprintf(f, "%sCPUAffinity:", prefix); + for (i = 0; i < c->cpuset_ncpus; i++) + if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset)) + fprintf(f, " %u", i); + fputs("\n", f); + } + + if (c->timer_slack_nsec != NSEC_INFINITY) + fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec); + + fprintf(f, + "%sStandardInput: %s\n" + "%sStandardOutput: %s\n" + "%sStandardError: %s\n", + prefix, exec_input_to_string(c->std_input), + prefix, exec_output_to_string(c->std_output), + prefix, exec_output_to_string(c->std_error)); + + if (c->std_input == EXEC_INPUT_NAMED_FD) + fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]); + if (c->std_output == EXEC_OUTPUT_NAMED_FD) + fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]); + if (c->std_error == EXEC_OUTPUT_NAMED_FD) + fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]); + + if (c->std_input == EXEC_INPUT_FILE) + fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]); + if (c->std_output == EXEC_OUTPUT_FILE) + fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]); + if (c->std_output == EXEC_OUTPUT_FILE_APPEND) + fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]); + if (c->std_error == EXEC_OUTPUT_FILE) + fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]); + if (c->std_error == EXEC_OUTPUT_FILE_APPEND) + fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]); + + if (c->tty_path) + fprintf(f, + "%sTTYPath: %s\n" + "%sTTYReset: %s\n" + "%sTTYVHangup: %s\n" + "%sTTYVTDisallocate: %s\n", + prefix, c->tty_path, + prefix, yes_no(c->tty_reset), + prefix, yes_no(c->tty_vhangup), + prefix, yes_no(c->tty_vt_disallocate)); + + if (IN_SET(c->std_output, + EXEC_OUTPUT_SYSLOG, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_JOURNAL, + EXEC_OUTPUT_SYSLOG_AND_CONSOLE, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE) || + IN_SET(c->std_error, + EXEC_OUTPUT_SYSLOG, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_JOURNAL, + EXEC_OUTPUT_SYSLOG_AND_CONSOLE, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) { + + _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL; + + r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str); + if (r >= 0) + fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str); + + r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str); + if (r >= 0) + fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str); + } + + if (c->log_level_max >= 0) { + _cleanup_free_ char *t = NULL; + + (void) log_level_to_string_alloc(c->log_level_max, &t); + + fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t)); + } + + if (c->log_rate_limit_interval_usec > 0) { + char buf_timespan[FORMAT_TIMESPAN_MAX]; + + fprintf(f, + "%sLogRateLimitIntervalSec: %s\n", + prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC)); + } + + if (c->log_rate_limit_burst > 0) + fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst); + + if (c->n_log_extra_fields > 0) { + size_t j; + + for (j = 0; j < c->n_log_extra_fields; j++) { + fprintf(f, "%sLogExtraFields: ", prefix); + fwrite(c->log_extra_fields[j].iov_base, + 1, c->log_extra_fields[j].iov_len, + f); + fputc('\n', f); + } + } + + if (c->secure_bits) { + _cleanup_free_ char *str = NULL; + + r = secure_bits_to_string_alloc(c->secure_bits, &str); + if (r >= 0) + fprintf(f, "%sSecure Bits: %s\n", prefix, str); + } + + if (c->capability_bounding_set != CAP_ALL) { + _cleanup_free_ char *str = NULL; + + r = capability_set_to_string_alloc(c->capability_bounding_set, &str); + if (r >= 0) + fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str); + } + + if (c->capability_ambient_set != 0) { + _cleanup_free_ char *str = NULL; + + r = capability_set_to_string_alloc(c->capability_ambient_set, &str); + if (r >= 0) + fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str); + } + + if (c->user) + fprintf(f, "%sUser: %s\n", prefix, c->user); + if (c->group) + fprintf(f, "%sGroup: %s\n", prefix, c->group); + + fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user)); + + if (!strv_isempty(c->supplementary_groups)) { + fprintf(f, "%sSupplementaryGroups:", prefix); + strv_fprintf(f, c->supplementary_groups); + fputs("\n", f); + } + + if (c->pam_name) + fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name); + + if (!strv_isempty(c->read_write_paths)) { + fprintf(f, "%sReadWritePaths:", prefix); + strv_fprintf(f, c->read_write_paths); + fputs("\n", f); + } + + if (!strv_isempty(c->read_only_paths)) { + fprintf(f, "%sReadOnlyPaths:", prefix); + strv_fprintf(f, c->read_only_paths); + fputs("\n", f); + } + + if (!strv_isempty(c->inaccessible_paths)) { + fprintf(f, "%sInaccessiblePaths:", prefix); + strv_fprintf(f, c->inaccessible_paths); + fputs("\n", f); + } + + if (c->n_bind_mounts > 0) + for (i = 0; i < c->n_bind_mounts; i++) + fprintf(f, "%s%s: %s%s:%s:%s\n", prefix, + c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths", + c->bind_mounts[i].ignore_enoent ? "-": "", + c->bind_mounts[i].source, + c->bind_mounts[i].destination, + c->bind_mounts[i].recursive ? "rbind" : "norbind"); + + if (c->n_temporary_filesystems > 0) + for (i = 0; i < c->n_temporary_filesystems; i++) { + TemporaryFileSystem *t = c->temporary_filesystems + i; + + fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix, + t->path, + isempty(t->options) ? "" : ":", + strempty(t->options)); + } + + if (c->utmp_id) + fprintf(f, + "%sUtmpIdentifier: %s\n", + prefix, c->utmp_id); + + if (c->selinux_context) + fprintf(f, + "%sSELinuxContext: %s%s\n", + prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context); + + if (c->apparmor_profile) + fprintf(f, + "%sAppArmorProfile: %s%s\n", + prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile); + + if (c->smack_process_label) + fprintf(f, + "%sSmackProcessLabel: %s%s\n", + prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label); + + if (c->personality != PERSONALITY_INVALID) + fprintf(f, + "%sPersonality: %s\n", + prefix, strna(personality_to_string(c->personality))); + + fprintf(f, + "%sLockPersonality: %s\n", + prefix, yes_no(c->lock_personality)); + + if (c->syscall_filter) { +#if HAVE_SECCOMP + Iterator j; + void *id, *val; + bool first = true; +#endif + + fprintf(f, + "%sSystemCallFilter: ", + prefix); + + if (!c->syscall_whitelist) + fputc('~', f); + +#if HAVE_SECCOMP + HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) { + _cleanup_free_ char *name = NULL; + const char *errno_name = NULL; + int num = PTR_TO_INT(val); + + if (first) + first = false; + else + fputc(' ', f); + + name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); + fputs(strna(name), f); + + if (num >= 0) { + errno_name = errno_to_name(num); + if (errno_name) + fprintf(f, ":%s", errno_name); + else + fprintf(f, ":%d", num); + } + } +#endif + + fputc('\n', f); + } + + if (c->syscall_archs) { +#if HAVE_SECCOMP + Iterator j; + void *id; +#endif + + fprintf(f, + "%sSystemCallArchitectures:", + prefix); + +#if HAVE_SECCOMP + SET_FOREACH(id, c->syscall_archs, j) + fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1))); +#endif + fputc('\n', f); + } + + if (exec_context_restrict_namespaces_set(c)) { + _cleanup_free_ char *s = NULL; + + r = namespace_flags_to_string(c->restrict_namespaces, &s); + if (r >= 0) + fprintf(f, "%sRestrictNamespaces: %s\n", + prefix, s); + } + + if (c->syscall_errno > 0) { + const char *errno_name; + + fprintf(f, "%sSystemCallErrorNumber: ", prefix); + + errno_name = errno_to_name(c->syscall_errno); + if (errno_name) + fprintf(f, "%s\n", errno_name); + else + fprintf(f, "%d\n", c->syscall_errno); + } + + if (c->apparmor_profile) + fprintf(f, + "%sAppArmorProfile: %s%s\n", + prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile); +} + +bool exec_context_maintains_privileges(const ExecContext *c) { + assert(c); + + /* Returns true if the process forked off would run under + * an unchanged UID or as root. */ + + if (!c->user) + return true; + + if (streq(c->user, "root") || streq(c->user, "0")) + return true; + + return false; +} + +int exec_context_get_effective_ioprio(const ExecContext *c) { + int p; + + assert(c); + + if (c->ioprio_set) + return c->ioprio; + + p = ioprio_get(IOPRIO_WHO_PROCESS, 0); + if (p < 0) + return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4); + + return p; +} + +void exec_context_free_log_extra_fields(ExecContext *c) { + size_t l; + + assert(c); + + for (l = 0; l < c->n_log_extra_fields; l++) + free(c->log_extra_fields[l].iov_base); + c->log_extra_fields = mfree(c->log_extra_fields); + c->n_log_extra_fields = 0; +} + +void exec_status_start(ExecStatus *s, pid_t pid) { + assert(s); + + *s = (ExecStatus) { + .pid = pid, + }; + + dual_timestamp_get(&s->start_timestamp); +} + +void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) { + assert(s); + + if (s->pid != pid) { + *s = (ExecStatus) { + .pid = pid, + }; + } + + dual_timestamp_get(&s->exit_timestamp); + + s->code = code; + s->status = status; + + if (context) { + if (context->utmp_id) + (void) utmp_put_dead_process(context->utmp_id, pid, code, status); + + exec_context_tty_reset(context, NULL); + } +} + +void exec_status_reset(ExecStatus *s) { + assert(s); + + *s = (ExecStatus) {}; +} + +void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) { + char buf[FORMAT_TIMESTAMP_MAX]; + + assert(s); + assert(f); + + if (s->pid <= 0) + return; + + prefix = strempty(prefix); + + fprintf(f, + "%sPID: "PID_FMT"\n", + prefix, s->pid); + + if (dual_timestamp_is_set(&s->start_timestamp)) + fprintf(f, + "%sStart Timestamp: %s\n", + prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime)); + + if (dual_timestamp_is_set(&s->exit_timestamp)) + fprintf(f, + "%sExit Timestamp: %s\n" + "%sExit Code: %s\n" + "%sExit Status: %i\n", + prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime), + prefix, sigchld_code_to_string(s->code), + prefix, s->status); +} + +static char *exec_command_line(char **argv) { + size_t k; + char *n, *p, **a; + bool first = true; + + assert(argv); + + k = 1; + STRV_FOREACH(a, argv) + k += strlen(*a)+3; + + n = new(char, k); + if (!n) + return NULL; + + p = n; + STRV_FOREACH(a, argv) { + + if (!first) + *(p++) = ' '; + else + first = false; + + if (strpbrk(*a, WHITESPACE)) { + *(p++) = '\''; + p = stpcpy(p, *a); + *(p++) = '\''; + } else + p = stpcpy(p, *a); + + } + + *p = 0; + + /* FIXME: this doesn't really handle arguments that have + * spaces and ticks in them */ + + return n; +} + +static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) { + _cleanup_free_ char *cmd = NULL; + const char *prefix2; + + assert(c); + assert(f); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + cmd = exec_command_line(c->argv); + fprintf(f, + "%sCommand Line: %s\n", + prefix, cmd ? cmd : strerror(ENOMEM)); + + exec_status_dump(&c->exec_status, f, prefix2); +} + +void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) { + assert(f); + + prefix = strempty(prefix); + + LIST_FOREACH(command, c, c) + exec_command_dump(c, f, prefix); +} + +void exec_command_append_list(ExecCommand **l, ExecCommand *e) { + ExecCommand *end; + + assert(l); + assert(e); + + if (*l) { + /* It's kind of important, that we keep the order here */ + LIST_FIND_TAIL(command, *l, end); + LIST_INSERT_AFTER(command, *l, end, e); + } else + *l = e; +} + +int exec_command_set(ExecCommand *c, const char *path, ...) { + va_list ap; + char **l, *p; + + assert(c); + assert(path); + + va_start(ap, path); + l = strv_new_ap(path, ap); + va_end(ap); + + if (!l) + return -ENOMEM; + + p = strdup(path); + if (!p) { + strv_free(l); + return -ENOMEM; + } + + free_and_replace(c->path, p); + + return strv_free_and_replace(c->argv, l); +} + +int exec_command_append(ExecCommand *c, const char *path, ...) { + _cleanup_strv_free_ char **l = NULL; + va_list ap; + int r; + + assert(c); + assert(path); + + va_start(ap, path); + l = strv_new_ap(path, ap); + va_end(ap); + + if (!l) + return -ENOMEM; + + r = strv_extend_strv(&c->argv, l, false); + if (r < 0) + return r; + + return 0; +} + +static void *remove_tmpdir_thread(void *p) { + _cleanup_free_ char *path = p; + + (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL); + return NULL; +} + +static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) { + int r; + + if (!rt) + return NULL; + + if (rt->manager) + (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id); + + /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */ + if (destroy && rt->tmp_dir) { + log_debug("Spawning thread to nuke %s", rt->tmp_dir); + + r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir); + if (r < 0) { + log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir); + free(rt->tmp_dir); + } + + rt->tmp_dir = NULL; + } + + if (destroy && rt->var_tmp_dir) { + log_debug("Spawning thread to nuke %s", rt->var_tmp_dir); + + r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir); + if (r < 0) { + log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir); + free(rt->var_tmp_dir); + } + + rt->var_tmp_dir = NULL; + } + + rt->id = mfree(rt->id); + rt->tmp_dir = mfree(rt->tmp_dir); + rt->var_tmp_dir = mfree(rt->var_tmp_dir); + safe_close_pair(rt->netns_storage_socket); + return mfree(rt); +} + +static void exec_runtime_freep(ExecRuntime **rt) { + if (*rt) + (void) exec_runtime_free(*rt, false); +} + +static int exec_runtime_allocate(ExecRuntime **rt) { + assert(rt); + + *rt = new0(ExecRuntime, 1); + if (!*rt) + return -ENOMEM; + + (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1; + return 0; +} + +static int exec_runtime_add( + Manager *m, + const char *id, + const char *tmp_dir, + const char *var_tmp_dir, + const int netns_storage_socket[2], + ExecRuntime **ret) { + + _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL; + int r; + + assert(m); + assert(id); + + r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops); + if (r < 0) + return r; + + r = exec_runtime_allocate(&rt); + if (r < 0) + return r; + + rt->id = strdup(id); + if (!rt->id) + return -ENOMEM; + + if (tmp_dir) { + rt->tmp_dir = strdup(tmp_dir); + if (!rt->tmp_dir) + return -ENOMEM; + + /* When tmp_dir is set, then we require var_tmp_dir is also set. */ + assert(var_tmp_dir); + rt->var_tmp_dir = strdup(var_tmp_dir); + if (!rt->var_tmp_dir) + return -ENOMEM; + } + + if (netns_storage_socket) { + rt->netns_storage_socket[0] = netns_storage_socket[0]; + rt->netns_storage_socket[1] = netns_storage_socket[1]; + } + + r = hashmap_put(m->exec_runtime_by_id, rt->id, rt); + if (r < 0) + return r; + + rt->manager = m; + + if (ret) + *ret = rt; + + /* do not remove created ExecRuntime object when the operation succeeds. */ + rt = NULL; + return 0; +} + +static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) { + _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL; + _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1}; + int r; + + assert(m); + assert(c); + assert(id); + + /* It is not necessary to create ExecRuntime object. */ + if (!c->private_network && !c->private_tmp) + return 0; + + if (c->private_tmp) { + r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir); + if (r < 0) + return r; + } + + if (c->private_network) { + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0) + return -errno; + } + + r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret); + if (r < 0) + return r; + + /* Avoid cleanup */ + netns_storage_socket[0] = -1; + netns_storage_socket[1] = -1; + return 1; +} + +int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) { + ExecRuntime *rt; + int r; + + assert(m); + assert(id); + assert(ret); + + rt = hashmap_get(m->exec_runtime_by_id, id); + if (rt) + /* We already have a ExecRuntime object, let's increase the ref count and reuse it */ + goto ref; + + if (!create) + return 0; + + /* If not found, then create a new object. */ + r = exec_runtime_make(m, c, id, &rt); + if (r <= 0) + /* When r == 0, it is not necessary to create ExecRuntime object. */ + return r; + +ref: + /* increment reference counter. */ + rt->n_ref++; + *ret = rt; + return 1; +} + +ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) { + if (!rt) + return NULL; + + assert(rt->n_ref > 0); + + rt->n_ref--; + if (rt->n_ref > 0) + return NULL; + + return exec_runtime_free(rt, destroy); +} + +int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) { + ExecRuntime *rt; + Iterator i; + + assert(m); + assert(f); + assert(fds); + + HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) { + fprintf(f, "exec-runtime=%s", rt->id); + + if (rt->tmp_dir) + fprintf(f, " tmp-dir=%s", rt->tmp_dir); + + if (rt->var_tmp_dir) + fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir); + + if (rt->netns_storage_socket[0] >= 0) { + int copy; + + copy = fdset_put_dup(fds, rt->netns_storage_socket[0]); + if (copy < 0) + return copy; + + fprintf(f, " netns-socket-0=%i", copy); + } + + if (rt->netns_storage_socket[1] >= 0) { + int copy; + + copy = fdset_put_dup(fds, rt->netns_storage_socket[1]); + if (copy < 0) + return copy; + + fprintf(f, " netns-socket-1=%i", copy); + } + + fputc('\n', f); + } + + return 0; +} + +int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) { + _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL; + ExecRuntime *rt; + int r; + + /* This is for the migration from old (v237 or earlier) deserialization text. + * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=. + * Even if the ExecRuntime object originally created by the other unit, we cannot judge + * so or not from the serialized text, then we always creates a new object owned by this. */ + + assert(u); + assert(key); + assert(value); + + /* Manager manages ExecRuntime objects by the unit id. + * So, we omit the serialized text when the unit does not have id (yet?)... */ + if (isempty(u->id)) { + log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter."); + return 0; + } + + r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m"); + return 0; + } + + rt = hashmap_get(u->manager->exec_runtime_by_id, u->id); + if (!rt) { + r = exec_runtime_allocate(&rt_create); + if (r < 0) + return log_oom(); + + rt_create->id = strdup(u->id); + if (!rt_create->id) + return log_oom(); + + rt = rt_create; + } + + if (streq(key, "tmp-dir")) { + char *copy; + + copy = strdup(value); + if (!copy) + return log_oom(); + + free_and_replace(rt->tmp_dir, copy); + + } else if (streq(key, "var-tmp-dir")) { + char *copy; + + copy = strdup(value); + if (!copy) + return log_oom(); + + free_and_replace(rt->var_tmp_dir, copy); + + } else if (streq(key, "netns-socket-0")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Failed to parse netns socket value: %s", value); + return 0; + } + + safe_close(rt->netns_storage_socket[0]); + rt->netns_storage_socket[0] = fdset_remove(fds, fd); + + } else if (streq(key, "netns-socket-1")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Failed to parse netns socket value: %s", value); + return 0; + } + + safe_close(rt->netns_storage_socket[1]); + rt->netns_storage_socket[1] = fdset_remove(fds, fd); + } else + return 0; + + /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */ + if (rt_create) { + r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m"); + return 0; + } + + rt_create->manager = u->manager; + + /* Avoid cleanup */ + rt_create = NULL; + } + + return 1; +} + +void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) { + char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL; + int r, fd0 = -1, fd1 = -1; + const char *p, *v = value; + size_t n; + + assert(m); + assert(value); + assert(fds); + + n = strcspn(v, " "); + id = strndupa(v, n); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + + v = startswith(p, "tmp-dir="); + if (v) { + n = strcspn(v, " "); + tmp_dir = strndupa(v, n); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "var-tmp-dir="); + if (v) { + n = strcspn(v, " "); + var_tmp_dir = strndupa(v, n); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "netns-socket-0="); + if (v) { + char *buf; + + n = strcspn(v, " "); + buf = strndupa(v, n); + if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) { + log_debug("Unable to process exec-runtime netns fd specification."); + return; + } + fd0 = fdset_remove(fds, fd0); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "netns-socket-1="); + if (v) { + char *buf; + + n = strcspn(v, " "); + buf = strndupa(v, n); + if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) { + log_debug("Unable to process exec-runtime netns fd specification."); + return; + } + fd1 = fdset_remove(fds, fd1); + } + +finalize: + + r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL); + if (r < 0) + log_debug_errno(r, "Failed to add exec-runtime: %m"); +} + +void exec_runtime_vacuum(Manager *m) { + ExecRuntime *rt; + Iterator i; + + assert(m); + + /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */ + + HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) { + if (rt->n_ref > 0) + continue; + + (void) exec_runtime_free(rt, false); + } +} + +void exec_params_clear(ExecParameters *p) { + if (!p) + return; + + strv_free(p->environment); +} + +static const char* const exec_input_table[_EXEC_INPUT_MAX] = { + [EXEC_INPUT_NULL] = "null", + [EXEC_INPUT_TTY] = "tty", + [EXEC_INPUT_TTY_FORCE] = "tty-force", + [EXEC_INPUT_TTY_FAIL] = "tty-fail", + [EXEC_INPUT_SOCKET] = "socket", + [EXEC_INPUT_NAMED_FD] = "fd", + [EXEC_INPUT_DATA] = "data", + [EXEC_INPUT_FILE] = "file", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput); + +static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = { + [EXEC_OUTPUT_INHERIT] = "inherit", + [EXEC_OUTPUT_NULL] = "null", + [EXEC_OUTPUT_TTY] = "tty", + [EXEC_OUTPUT_SYSLOG] = "syslog", + [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console", + [EXEC_OUTPUT_KMSG] = "kmsg", + [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console", + [EXEC_OUTPUT_JOURNAL] = "journal", + [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console", + [EXEC_OUTPUT_SOCKET] = "socket", + [EXEC_OUTPUT_NAMED_FD] = "fd", + [EXEC_OUTPUT_FILE] = "file", + [EXEC_OUTPUT_FILE_APPEND] = "append", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput); + +static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = { + [EXEC_UTMP_INIT] = "init", + [EXEC_UTMP_LOGIN] = "login", + [EXEC_UTMP_USER] = "user", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode); + +static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = { + [EXEC_PRESERVE_NO] = "no", + [EXEC_PRESERVE_YES] = "yes", + [EXEC_PRESERVE_RESTART] = "restart", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES); + +static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory", + [EXEC_DIRECTORY_STATE] = "StateDirectory", + [EXEC_DIRECTORY_CACHE] = "CacheDirectory", + [EXEC_DIRECTORY_LOGS] = "LogsDirectory", + [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType); + +static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY", + [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY", + [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY", + [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY", + [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType); + +static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = { + [EXEC_KEYRING_INHERIT] = "inherit", + [EXEC_KEYRING_PRIVATE] = "private", + [EXEC_KEYRING_SHARED] = "shared", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode); diff --git a/src/core/execute.h b/src/core/execute.h new file mode 100644 index 0000000..0f1bf56 --- /dev/null +++ b/src/core/execute.h @@ -0,0 +1,404 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct ExecStatus ExecStatus; +typedef struct ExecCommand ExecCommand; +typedef struct ExecContext ExecContext; +typedef struct ExecRuntime ExecRuntime; +typedef struct ExecParameters ExecParameters; +typedef struct Manager Manager; + +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <sys/capability.h> + +#include "cgroup-util.h" +#include "fdset.h" +#include "list.h" +#include "missing_resource.h" +#include "namespace.h" +#include "nsflags.h" + +#define EXEC_STDIN_DATA_MAX (64U*1024U*1024U) + +typedef enum ExecUtmpMode { + EXEC_UTMP_INIT, + EXEC_UTMP_LOGIN, + EXEC_UTMP_USER, + _EXEC_UTMP_MODE_MAX, + _EXEC_UTMP_MODE_INVALID = -1 +} ExecUtmpMode; + +typedef enum ExecInput { + EXEC_INPUT_NULL, + EXEC_INPUT_TTY, + EXEC_INPUT_TTY_FORCE, + EXEC_INPUT_TTY_FAIL, + EXEC_INPUT_SOCKET, + EXEC_INPUT_NAMED_FD, + EXEC_INPUT_DATA, + EXEC_INPUT_FILE, + _EXEC_INPUT_MAX, + _EXEC_INPUT_INVALID = -1 +} ExecInput; + +typedef enum ExecOutput { + EXEC_OUTPUT_INHERIT, + EXEC_OUTPUT_NULL, + EXEC_OUTPUT_TTY, + EXEC_OUTPUT_SYSLOG, + EXEC_OUTPUT_SYSLOG_AND_CONSOLE, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_SOCKET, + EXEC_OUTPUT_NAMED_FD, + EXEC_OUTPUT_FILE, + EXEC_OUTPUT_FILE_APPEND, + _EXEC_OUTPUT_MAX, + _EXEC_OUTPUT_INVALID = -1 +} ExecOutput; + +typedef enum ExecPreserveMode { + EXEC_PRESERVE_NO, + EXEC_PRESERVE_YES, + EXEC_PRESERVE_RESTART, + _EXEC_PRESERVE_MODE_MAX, + _EXEC_PRESERVE_MODE_INVALID = -1 +} ExecPreserveMode; + +typedef enum ExecKeyringMode { + EXEC_KEYRING_INHERIT, + EXEC_KEYRING_PRIVATE, + EXEC_KEYRING_SHARED, + _EXEC_KEYRING_MODE_MAX, + _EXEC_KEYRING_MODE_INVALID = -1, +} ExecKeyringMode; + +/* Contains start and exit information about an executed command. */ +struct ExecStatus { + pid_t pid; + dual_timestamp start_timestamp; + dual_timestamp exit_timestamp; + int code; /* as in siginfo_t::si_code */ + int status; /* as in sigingo_t::si_status */ +}; + +typedef enum ExecCommandFlags { + EXEC_COMMAND_IGNORE_FAILURE = 1 << 0, + EXEC_COMMAND_FULLY_PRIVILEGED = 1 << 1, + EXEC_COMMAND_NO_SETUID = 1 << 2, + EXEC_COMMAND_AMBIENT_MAGIC = 1 << 3, +} ExecCommandFlags; + +/* Stores information about commands we execute. Covers both configuration settings as well as runtime data. */ +struct ExecCommand { + char *path; + char **argv; + ExecStatus exec_status; + ExecCommandFlags flags; + LIST_FIELDS(ExecCommand, command); /* useful for chaining commands */ +}; + +/* Encapsulates certain aspects of the runtime environment that is to be shared between multiple otherwise separate + * invocations of commands. Specifically, this allows sharing of /tmp and /var/tmp data as well as network namespaces + * between invocations of commands. This is a reference counted object, with one reference taken by each currently + * active command invocation that wants to share this runtime. */ +struct ExecRuntime { + unsigned n_ref; + + Manager *manager; + + char *id; /* Unit id of the owner */ + + char *tmp_dir; + char *var_tmp_dir; + + /* An AF_UNIX socket pair, that contains a datagram containing a file descriptor referring to the network + * namespace. */ + int netns_storage_socket[2]; +}; + +typedef enum ExecDirectoryType { + EXEC_DIRECTORY_RUNTIME = 0, + EXEC_DIRECTORY_STATE, + EXEC_DIRECTORY_CACHE, + EXEC_DIRECTORY_LOGS, + EXEC_DIRECTORY_CONFIGURATION, + _EXEC_DIRECTORY_TYPE_MAX, + _EXEC_DIRECTORY_TYPE_INVALID = -1, +} ExecDirectoryType; + +typedef struct ExecDirectory { + char **paths; + mode_t mode; +} ExecDirectory; + +/* Encodes configuration parameters applied to invoked commands. Does not carry runtime data, but only configuration + * changes sourced from unit files and suchlike. ExecContext objects are usually embedded into Unit objects, and do not + * change after being loaded. */ +struct ExecContext { + char **environment; + char **environment_files; + char **pass_environment; + char **unset_environment; + + struct rlimit *rlimit[_RLIMIT_MAX]; + char *working_directory, *root_directory, *root_image; + bool working_directory_missing_ok; + bool working_directory_home; + + mode_t umask; + int oom_score_adjust; + int nice; + int ioprio; + int cpu_sched_policy; + int cpu_sched_priority; + + cpu_set_t *cpuset; + unsigned cpuset_ncpus; + + ExecInput std_input; + ExecOutput std_output; + ExecOutput std_error; + char *stdio_fdname[3]; + char *stdio_file[3]; + + void *stdin_data; + size_t stdin_data_size; + + nsec_t timer_slack_nsec; + + bool stdio_as_fds; + + char *tty_path; + + bool tty_reset; + bool tty_vhangup; + bool tty_vt_disallocate; + + bool ignore_sigpipe; + + /* Since resolving these names might involve socket + * connections and we don't want to deadlock ourselves these + * names are resolved on execution only and in the child + * process. */ + char *user; + char *group; + char **supplementary_groups; + + char *pam_name; + + char *utmp_id; + ExecUtmpMode utmp_mode; + + bool selinux_context_ignore; + char *selinux_context; + + bool apparmor_profile_ignore; + char *apparmor_profile; + + bool smack_process_label_ignore; + char *smack_process_label; + + ExecKeyringMode keyring_mode; + + char **read_write_paths, **read_only_paths, **inaccessible_paths; + unsigned long mount_flags; + BindMount *bind_mounts; + size_t n_bind_mounts; + TemporaryFileSystem *temporary_filesystems; + size_t n_temporary_filesystems; + + uint64_t capability_bounding_set; + uint64_t capability_ambient_set; + int secure_bits; + + int syslog_priority; + char *syslog_identifier; + bool syslog_level_prefix; + + int log_level_max; + + struct iovec* log_extra_fields; + size_t n_log_extra_fields; + + usec_t log_rate_limit_interval_usec; + unsigned log_rate_limit_burst; + + bool cpu_sched_reset_on_fork; + bool non_blocking; + bool private_tmp; + bool private_network; + bool private_devices; + bool private_users; + bool private_mounts; + ProtectSystem protect_system; + ProtectHome protect_home; + bool protect_kernel_tunables; + bool protect_kernel_modules; + bool protect_control_groups; + bool mount_apivfs; + + bool no_new_privileges; + + bool dynamic_user; + bool remove_ipc; + + /* This is not exposed to the user but available + * internally. We need it to make sure that whenever we spawn + * /usr/bin/mount it is run in the same process group as us so + * that the autofs logic detects that it belongs to us and we + * don't enter a trigger loop. */ + bool same_pgrp; + + unsigned long personality; + bool lock_personality; + + unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */ + + Hashmap *syscall_filter; + Set *syscall_archs; + int syscall_errno; + bool syscall_whitelist:1; + + Set *address_families; + bool address_families_whitelist:1; + + ExecPreserveMode runtime_directory_preserve_mode; + ExecDirectory directories[_EXEC_DIRECTORY_TYPE_MAX]; + + bool memory_deny_write_execute; + bool restrict_realtime; + + bool oom_score_adjust_set:1; + bool nice_set:1; + bool ioprio_set:1; + bool cpu_sched_set:1; +}; + +static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) { + assert(c); + + return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL; +} + +typedef enum ExecFlags { + EXEC_APPLY_SANDBOXING = 1 << 0, + EXEC_APPLY_CHROOT = 1 << 1, + EXEC_APPLY_TTY_STDIN = 1 << 2, + EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */ + EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */ + EXEC_NSS_BYPASS_BUS = 1 << 5, /* Set the SYSTEMD_NSS_BYPASS_BUS environment variable, to disable nss-systemd for dbus */ + EXEC_CGROUP_DELEGATE = 1 << 6, + EXEC_IS_CONTROL = 1 << 7, + EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */ + + /* The following are not used by execute.c, but by consumers internally */ + EXEC_PASS_FDS = 1 << 9, + EXEC_SETENV_RESULT = 1 << 10, + EXEC_SET_WATCHDOG = 1 << 11, +} ExecFlags; + +/* Parameters for a specific invocation of a command. This structure is put together right before a command is + * executed. */ +struct ExecParameters { + char **environment; + + int *fds; + char **fd_names; + size_t n_socket_fds; + size_t n_storage_fds; + + ExecFlags flags; + bool selinux_context_net:1; + + CGroupMask cgroup_supported; + const char *cgroup_path; + + char **prefix; + + const char *confirm_spawn; + + usec_t watchdog_usec; + + int *idle_pipe; + + int stdin_fd; + int stdout_fd; + int stderr_fd; + + /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */ + int exec_fd; +}; + +#include "unit.h" +#include "dynamic-user.h" + +int exec_spawn(Unit *unit, + ExecCommand *command, + const ExecContext *context, + const ExecParameters *exec_params, + ExecRuntime *runtime, + DynamicCreds *dynamic_creds, + pid_t *ret); + +void exec_command_done_array(ExecCommand *c, size_t n); +ExecCommand* exec_command_free_list(ExecCommand *c); +void exec_command_free_array(ExecCommand **c, size_t n); +void exec_command_reset_status_array(ExecCommand *c, size_t n); +void exec_command_reset_status_list_array(ExecCommand **c, size_t n); +void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix); +void exec_command_append_list(ExecCommand **l, ExecCommand *e); +int exec_command_set(ExecCommand *c, const char *path, ...) _sentinel_; +int exec_command_append(ExecCommand *c, const char *path, ...) _sentinel_; + +void exec_context_init(ExecContext *c); +void exec_context_done(ExecContext *c); +void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix); + +int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_root); + +const char* exec_context_fdname(const ExecContext *c, int fd_index); + +bool exec_context_may_touch_console(const ExecContext *c); +bool exec_context_maintains_privileges(const ExecContext *c); + +int exec_context_get_effective_ioprio(const ExecContext *c); + +void exec_context_free_log_extra_fields(ExecContext *c); + +void exec_status_start(ExecStatus *s, pid_t pid); +void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status); +void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix); +void exec_status_reset(ExecStatus *s); + +int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *name, bool create, ExecRuntime **ret); +ExecRuntime *exec_runtime_unref(ExecRuntime *r, bool destroy); + +int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds); +int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds); +void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds); +void exec_runtime_vacuum(Manager *m); + +void exec_params_clear(ExecParameters *p); + +const char* exec_output_to_string(ExecOutput i) _const_; +ExecOutput exec_output_from_string(const char *s) _pure_; + +const char* exec_input_to_string(ExecInput i) _const_; +ExecInput exec_input_from_string(const char *s) _pure_; + +const char* exec_utmp_mode_to_string(ExecUtmpMode i) _const_; +ExecUtmpMode exec_utmp_mode_from_string(const char *s) _pure_; + +const char* exec_preserve_mode_to_string(ExecPreserveMode i) _const_; +ExecPreserveMode exec_preserve_mode_from_string(const char *s) _pure_; + +const char* exec_keyring_mode_to_string(ExecKeyringMode i) _const_; +ExecKeyringMode exec_keyring_mode_from_string(const char *s) _pure_; + +const char* exec_directory_type_to_string(ExecDirectoryType i) _const_; +ExecDirectoryType exec_directory_type_from_string(const char *s) _pure_; diff --git a/src/core/hostname-setup.c b/src/core/hostname-setup.c new file mode 100644 index 0000000..83cce88 --- /dev/null +++ b/src/core/hostname-setup.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> + +#include "alloc-util.h" +#include "fileio.h" +#include "hostname-setup.h" +#include "hostname-util.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "util.h" + +int hostname_setup(void) { + _cleanup_free_ char *b = NULL; + bool enoent = false; + const char *hn; + int r; + + r = read_etc_hostname(NULL, &b); + if (r < 0) { + if (r == -ENOENT) + enoent = true; + else + log_warning_errno(r, "Failed to read configured hostname: %m"); + + hn = NULL; + } else + hn = b; + + if (isempty(hn)) { + /* Don't override the hostname if it is already set + * and not explicitly configured */ + if (hostname_is_set()) + return 0; + + if (enoent) + log_info("No hostname configured."); + + hn = FALLBACK_HOSTNAME; + } + + r = sethostname_idempotent(hn); + if (r < 0) + return log_warning_errno(r, "Failed to set hostname to <%s>: %m", hn); + + log_info("Set hostname to <%s>.", hn); + return 0; +} diff --git a/src/core/hostname-setup.h b/src/core/hostname-setup.h new file mode 100644 index 0000000..dc7b9a6 --- /dev/null +++ b/src/core/hostname-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +int hostname_setup(void); diff --git a/src/core/ima-setup.c b/src/core/ima-setup.c new file mode 100644 index 0000000..fd7c5f6 --- /dev/null +++ b/src/core/ima-setup.c @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/*** + Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy + TORSEC group — http://security.polito.it +***/ + +#include <errno.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "ima-setup.h" +#include "log.h" +#include "util.h" + +#define IMA_SECFS_DIR "/sys/kernel/security/ima" +#define IMA_SECFS_POLICY IMA_SECFS_DIR "/policy" +#define IMA_POLICY_PATH "/etc/ima/ima-policy" + +int ima_setup(void) { +#if ENABLE_IMA + _cleanup_fclose_ FILE *input = NULL; + _cleanup_close_ int imafd = -1; + unsigned lineno = 0; + int r; + + if (access(IMA_SECFS_DIR, F_OK) < 0) { + log_debug_errno(errno, "IMA support is disabled in the kernel, ignoring: %m"); + return 0; + } + + if (access(IMA_SECFS_POLICY, W_OK) < 0) { + log_warning_errno(errno, "Another IMA custom policy has already been loaded, ignoring: %m"); + return 0; + } + + if (access(IMA_POLICY_PATH, F_OK) < 0) { + log_debug_errno(errno, "No IMA custom policy file "IMA_POLICY_PATH", ignoring: %m"); + return 0; + } + + imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC); + if (imafd < 0) { + log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m"); + return 0; + } + + /* attempt to write the name of the policy file into sysfs file */ + if (write(imafd, IMA_POLICY_PATH, STRLEN(IMA_POLICY_PATH)) > 0) + goto done; + + /* fall back to copying the policy line-by-line */ + input = fopen(IMA_POLICY_PATH, "re"); + if (!input) { + log_warning_errno(errno, "Failed to open the IMA custom policy file "IMA_POLICY_PATH", ignoring: %m"); + return 0; + } + + safe_close(imafd); + + imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC); + if (imafd < 0) { + log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m"); + return 0; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + size_t len; + + r = read_line(input, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read the IMA custom policy file "IMA_POLICY_PATH": %m"); + if (r == 0) + break; + + len = strlen(line); + lineno++; + + if (len > 0 && write(imafd, line, len) < 0) + return log_error_errno(errno, "Failed to load the IMA custom policy file "IMA_POLICY_PATH"%u: %m", + lineno); + } + +done: + log_info("Successfully loaded the IMA custom policy "IMA_POLICY_PATH"."); +#endif /* ENABLE_IMA */ + return 0; +} diff --git a/src/core/ima-setup.h b/src/core/ima-setup.h new file mode 100644 index 0000000..cf47879 --- /dev/null +++ b/src/core/ima-setup.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +/*** + Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy + TORSEC group — http://security.polito.it +***/ + +int ima_setup(void); diff --git a/src/core/ip-address-access.c b/src/core/ip-address-access.c new file mode 100644 index 0000000..1d431bb --- /dev/null +++ b/src/core/ip-address-access.c @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <stdio.h> +#include <stdlib.h> + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "extract-word.h" +#include "hostname-util.h" +#include "ip-address-access.h" +#include "parse-util.h" +#include "string-util.h" + +int config_parse_ip_address_access( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + IPAddressAccessItem **list = data; + const char *p; + int r; + + assert(list); + + if (isempty(rvalue)) { + *list = ip_address_access_free_all(*list); + return 0; + } + + p = rvalue; + + for (;;) { + _cleanup_free_ IPAddressAccessItem *a = NULL; + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + break; + } + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + if (streq(word, "any")) { + /* "any" is a shortcut for 0.0.0.0/0 and ::/0 */ + + a->family = AF_INET; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + + } else if (is_localhost(word)) { + /* "localhost" is a shortcut for 127.0.0.0/8 and ::1/128 */ + + a->family = AF_INET; + a->address.in.s_addr = htobe32(0x7f000000); + a->prefixlen = 8; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + a->address.in6 = (struct in6_addr) IN6ADDR_LOOPBACK_INIT; + a->prefixlen = 128; + + } else if (streq(word, "link-local")) { + + /* "link-local" is a shortcut for 169.254.0.0/16 and fe80::/64 */ + + a->family = AF_INET; + a->address.in.s_addr = htobe32((UINT32_C(169) << 24 | UINT32_C(254) << 16)); + a->prefixlen = 16; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + a->address.in6 = (struct in6_addr) { + .s6_addr32[0] = htobe32(0xfe800000) + }; + a->prefixlen = 64; + + } else if (streq(word, "multicast")) { + + /* "multicast" is a shortcut for 224.0.0.0/4 and ff00::/8 */ + + a->family = AF_INET; + a->address.in.s_addr = htobe32((UINT32_C(224) << 24)); + a->prefixlen = 4; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + a->address.in6 = (struct in6_addr) { + .s6_addr32[0] = htobe32(0xff000000) + }; + a->prefixlen = 8; + + } else { + r = in_addr_prefix_from_string_auto(word, &a->family, &a->address, &a->prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Address prefix is invalid, ignoring assignment: %s", word); + return 0; + } + } + + LIST_APPEND(items, *list, a); + a = NULL; + } + + *list = ip_address_access_reduce(*list); + + if (*list) { + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == BPF_FIREWALL_UNSUPPORTED) { + static bool warned = false; + + log_full(warned ? LOG_DEBUG : LOG_WARNING, + "File %s:%u configures an IP firewall (%s=%s), but the local system does not support BPF/cgroup based firewalling.\n" + "Proceeding WITHOUT firewalling in effect! (This warning is only shown for the first loaded unit using IP firewalling.)", filename, line, lvalue, rvalue); + + warned = true; + } + } + + return 0; +} + +IPAddressAccessItem* ip_address_access_free_all(IPAddressAccessItem *first) { + IPAddressAccessItem *next, *p = first; + + while (p) { + next = p->items_next; + free(p); + + p = next; + } + + return NULL; +} + +IPAddressAccessItem* ip_address_access_reduce(IPAddressAccessItem *first) { + IPAddressAccessItem *a, *b, *tmp; + int r; + + /* Drops all entries from the list that are covered by another entry in full, thus removing all redundant + * entries. */ + + LIST_FOREACH_SAFE(items, a, tmp, first) { + + /* Drop irrelevant bits */ + (void) in_addr_mask(a->family, &a->address, a->prefixlen); + + LIST_FOREACH(items, b, first) { + + if (a == b) + continue; + + if (a->family != b->family) + continue; + + if (b->prefixlen > a->prefixlen) + continue; + + r = in_addr_prefix_covers(b->family, + &b->address, + b->prefixlen, + &a->address); + if (r > 0) { + /* b covers a fully, then let's drop a */ + LIST_REMOVE(items, first, a); + free(a); + break; + } + } + } + + return first; +} diff --git a/src/core/ip-address-access.h b/src/core/ip-address-access.h new file mode 100644 index 0000000..77078e1 --- /dev/null +++ b/src/core/ip-address-access.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "conf-parser.h" +#include "in-addr-util.h" +#include "list.h" + +typedef struct IPAddressAccessItem IPAddressAccessItem; + +struct IPAddressAccessItem { + int family; + unsigned char prefixlen; + union in_addr_union address; + LIST_FIELDS(IPAddressAccessItem, items); +}; + +CONFIG_PARSER_PROTOTYPE(config_parse_ip_address_access); + +IPAddressAccessItem* ip_address_access_free_all(IPAddressAccessItem *first); + +IPAddressAccessItem* ip_address_access_reduce(IPAddressAccessItem *first); diff --git a/src/core/job.c b/src/core/job.c new file mode 100644 index 0000000..59bb9d2 --- /dev/null +++ b/src/core/job.c @@ -0,0 +1,1668 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> + +#include "sd-id128.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "async.h" +#include "dbus-job.h" +#include "dbus.h" +#include "escape.h" +#include "fileio.h" +#include "job.h" +#include "log.h" +#include "macro.h" +#include "parse-util.h" +#include "serialize.h" +#include "set.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "unit.h" +#include "virt.h" + +Job* job_new_raw(Unit *unit) { + Job *j; + + /* used for deserialization */ + + assert(unit); + + j = new(Job, 1); + if (!j) + return NULL; + + *j = (Job) { + .manager = unit->manager, + .unit = unit, + .type = _JOB_TYPE_INVALID, + }; + + return j; +} + +Job* job_new(Unit *unit, JobType type) { + Job *j; + + assert(type < _JOB_TYPE_MAX); + + j = job_new_raw(unit); + if (!j) + return NULL; + + j->id = j->manager->current_job_id++; + j->type = type; + + /* We don't link it here, that's what job_dependency() is for */ + + return j; +} + +void job_unlink(Job *j) { + assert(j); + assert(!j->installed); + assert(!j->transaction_prev); + assert(!j->transaction_next); + assert(!j->subject_list); + assert(!j->object_list); + + if (j->in_run_queue) { + LIST_REMOVE(run_queue, j->manager->run_queue, j); + j->in_run_queue = false; + } + + if (j->in_dbus_queue) { + LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = false; + } + + if (j->in_gc_queue) { + LIST_REMOVE(gc_queue, j->manager->gc_job_queue, j); + j->in_gc_queue = false; + } + + j->timer_event_source = sd_event_source_unref(j->timer_event_source); +} + +Job* job_free(Job *j) { + assert(j); + assert(!j->installed); + assert(!j->transaction_prev); + assert(!j->transaction_next); + assert(!j->subject_list); + assert(!j->object_list); + + job_unlink(j); + + sd_bus_track_unref(j->bus_track); + strv_free(j->deserialized_clients); + + return mfree(j); +} + +static void job_set_state(Job *j, JobState state) { + assert(j); + assert(state >= 0); + assert(state < _JOB_STATE_MAX); + + if (j->state == state) + return; + + j->state = state; + + if (!j->installed) + return; + + if (j->state == JOB_RUNNING) + j->unit->manager->n_running_jobs++; + else { + assert(j->state == JOB_WAITING); + assert(j->unit->manager->n_running_jobs > 0); + + j->unit->manager->n_running_jobs--; + + if (j->unit->manager->n_running_jobs <= 0) + j->unit->manager->jobs_in_progress_event_source = sd_event_source_unref(j->unit->manager->jobs_in_progress_event_source); + } +} + +void job_uninstall(Job *j) { + Job **pj; + + assert(j->installed); + + job_set_state(j, JOB_WAITING); + + pj = (j->type == JOB_NOP) ? &j->unit->nop_job : &j->unit->job; + assert(*pj == j); + + /* Detach from next 'bigger' objects */ + + /* daemon-reload should be transparent to job observers */ + if (!MANAGER_IS_RELOADING(j->manager)) + bus_job_send_removed_signal(j); + + *pj = NULL; + + unit_add_to_gc_queue(j->unit); + + unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */ + + hashmap_remove_value(j->manager->jobs, UINT32_TO_PTR(j->id), j); + j->installed = false; +} + +static bool job_type_allows_late_merge(JobType t) { + /* Tells whether it is OK to merge a job of type 't' with an already + * running job. + * Reloads cannot be merged this way. Think of the sequence: + * 1. Reload of a daemon is in progress; the daemon has already loaded + * its config file, but hasn't completed the reload operation yet. + * 2. Edit foo's config file. + * 3. Trigger another reload to have the daemon use the new config. + * Should the second reload job be merged into the first one, the daemon + * would not know about the new config. + * JOB_RESTART jobs on the other hand can be merged, because they get + * patched into JOB_START after stopping the unit. So if we see a + * JOB_RESTART running, it means the unit hasn't stopped yet and at + * this time the merge is still allowed. */ + return t != JOB_RELOAD; +} + +static void job_merge_into_installed(Job *j, Job *other) { + assert(j->installed); + assert(j->unit == other->unit); + + if (j->type != JOB_NOP) + assert_se(job_type_merge_and_collapse(&j->type, other->type, j->unit) == 0); + else + assert(other->type == JOB_NOP); + + j->irreversible = j->irreversible || other->irreversible; + j->ignore_order = j->ignore_order || other->ignore_order; +} + +Job* job_install(Job *j) { + Job **pj; + Job *uj; + + assert(!j->installed); + assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(j->state == JOB_WAITING); + + pj = (j->type == JOB_NOP) ? &j->unit->nop_job : &j->unit->job; + uj = *pj; + + if (uj) { + if (job_type_is_conflicting(uj->type, j->type)) + job_finish_and_invalidate(uj, JOB_CANCELED, false, false); + else { + /* not conflicting, i.e. mergeable */ + + if (uj->state == JOB_WAITING || + (job_type_allows_late_merge(j->type) && job_type_is_superset(uj->type, j->type))) { + job_merge_into_installed(uj, j); + log_unit_debug(uj->unit, + "Merged %s/%s into installed job %s/%s as %"PRIu32, + j->unit->id, job_type_to_string(j->type), uj->unit->id, + job_type_to_string(uj->type), uj->id); + return uj; + } else { + /* already running and not safe to merge into */ + /* Patch uj to become a merged job and re-run it. */ + /* XXX It should be safer to queue j to run after uj finishes, but it is + * not currently possible to have more than one installed job per unit. */ + job_merge_into_installed(uj, j); + log_unit_debug(uj->unit, + "Merged into running job, re-running: %s/%s as %"PRIu32, + uj->unit->id, job_type_to_string(uj->type), uj->id); + + job_set_state(uj, JOB_WAITING); + return uj; + } + } + } + + /* Install the job */ + *pj = j; + j->installed = true; + + j->manager->n_installed_jobs++; + log_unit_debug(j->unit, + "Installed new job %s/%s as %u", + j->unit->id, job_type_to_string(j->type), (unsigned) j->id); + + job_add_to_gc_queue(j); + + job_add_to_dbus_queue(j); /* announce this job to clients */ + unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */ + + return j; +} + +int job_install_deserialized(Job *j) { + Job **pj; + int r; + + assert(!j->installed); + + if (j->type < 0 || j->type >= _JOB_TYPE_MAX_IN_TRANSACTION) + return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EINVAL), + "Invalid job type %s in deserialization.", + strna(job_type_to_string(j->type))); + + pj = (j->type == JOB_NOP) ? &j->unit->nop_job : &j->unit->job; + if (*pj) + return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EEXIST), + "Unit already has a job installed. Not installing deserialized job."); + + r = hashmap_put(j->manager->jobs, UINT32_TO_PTR(j->id), j); + if (r == -EEXIST) + return log_unit_debug_errno(j->unit, r, "Job ID %" PRIu32 " already used, cannot deserialize job.", j->id); + if (r < 0) + return log_unit_debug_errno(j->unit, r, "Failed to insert job into jobs hash table: %m"); + + *pj = j; + j->installed = true; + + if (j->state == JOB_RUNNING) + j->unit->manager->n_running_jobs++; + + log_unit_debug(j->unit, + "Reinstalled deserialized job %s/%s as %u", + j->unit->id, job_type_to_string(j->type), (unsigned) j->id); + return 0; +} + +JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts) { + JobDependency *l; + + assert(object); + + /* Adds a new job link, which encodes that the 'subject' job + * needs the 'object' job in some way. If 'subject' is NULL + * this means the 'anchor' job (i.e. the one the user + * explicitly asked for) is the requester. */ + + l = new0(JobDependency, 1); + if (!l) + return NULL; + + l->subject = subject; + l->object = object; + l->matters = matters; + l->conflicts = conflicts; + + if (subject) + LIST_PREPEND(subject, subject->subject_list, l); + + LIST_PREPEND(object, object->object_list, l); + + return l; +} + +void job_dependency_free(JobDependency *l) { + assert(l); + + if (l->subject) + LIST_REMOVE(subject, l->subject->subject_list, l); + + LIST_REMOVE(object, l->object->object_list, l); + + free(l); +} + +void job_dump(Job *j, FILE *f, const char *prefix) { + assert(j); + assert(f); + + prefix = strempty(prefix); + + fprintf(f, + "%s-> Job %u:\n" + "%s\tAction: %s -> %s\n" + "%s\tState: %s\n" + "%s\tIrreversible: %s\n" + "%s\tMay GC: %s\n", + prefix, j->id, + prefix, j->unit->id, job_type_to_string(j->type), + prefix, job_state_to_string(j->state), + prefix, yes_no(j->irreversible), + prefix, yes_no(job_may_gc(j))); +} + +/* + * Merging is commutative, so imagine the matrix as symmetric. We store only + * its lower triangle to avoid duplication. We don't store the main diagonal, + * because A merged with A is simply A. + * + * If the resulting type is collapsed immediately afterwards (to get rid of + * the JOB_RELOAD_OR_START, which lies outside the lookup function's domain), + * the following properties hold: + * + * Merging is associative! A merged with B, and then merged with C is the same + * as A merged with the result of B merged with C. + * + * Mergeability is transitive! If A can be merged with B and B with C then + * A also with C. + * + * Also, if A merged with B cannot be merged with C, then either A or B cannot + * be merged with C either. + */ +static const JobType job_merging_table[] = { +/* What \ With * JOB_START JOB_VERIFY_ACTIVE JOB_STOP JOB_RELOAD */ +/*********************************************************************************/ +/*JOB_START */ +/*JOB_VERIFY_ACTIVE */ JOB_START, +/*JOB_STOP */ -1, -1, +/*JOB_RELOAD */ JOB_RELOAD_OR_START, JOB_RELOAD, -1, +/*JOB_RESTART */ JOB_RESTART, JOB_RESTART, -1, JOB_RESTART, +}; + +JobType job_type_lookup_merge(JobType a, JobType b) { + assert_cc(ELEMENTSOF(job_merging_table) == _JOB_TYPE_MAX_MERGING * (_JOB_TYPE_MAX_MERGING - 1) / 2); + assert(a >= 0 && a < _JOB_TYPE_MAX_MERGING); + assert(b >= 0 && b < _JOB_TYPE_MAX_MERGING); + + if (a == b) + return a; + + if (a < b) { + JobType tmp = a; + a = b; + b = tmp; + } + + return job_merging_table[(a - 1) * a / 2 + b]; +} + +bool job_type_is_redundant(JobType a, UnitActiveState b) { + switch (a) { + + case JOB_START: + return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING); + + case JOB_STOP: + return IN_SET(b, UNIT_INACTIVE, UNIT_FAILED); + + case JOB_VERIFY_ACTIVE: + return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING); + + case JOB_RELOAD: + return + b == UNIT_RELOADING; + + case JOB_RESTART: + return + b == UNIT_ACTIVATING; + + case JOB_NOP: + return true; + + default: + assert_not_reached("Invalid job type"); + } +} + +JobType job_type_collapse(JobType t, Unit *u) { + UnitActiveState s; + + switch (t) { + + case JOB_TRY_RESTART: + s = unit_active_state(u); + if (UNIT_IS_INACTIVE_OR_DEACTIVATING(s)) + return JOB_NOP; + + return JOB_RESTART; + + case JOB_TRY_RELOAD: + s = unit_active_state(u); + if (UNIT_IS_INACTIVE_OR_DEACTIVATING(s)) + return JOB_NOP; + + return JOB_RELOAD; + + case JOB_RELOAD_OR_START: + s = unit_active_state(u); + if (UNIT_IS_INACTIVE_OR_DEACTIVATING(s)) + return JOB_START; + + return JOB_RELOAD; + + default: + return t; + } +} + +int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u) { + JobType t; + + t = job_type_lookup_merge(*a, b); + if (t < 0) + return -EEXIST; + + *a = job_type_collapse(t, u); + return 0; +} + +static bool job_is_runnable(Job *j) { + Iterator i; + Unit *other; + void *v; + + assert(j); + assert(j->installed); + + /* Checks whether there is any job running for the units this + * job needs to be running after (in the case of a 'positive' + * job type) or before (in the case of a 'negative' job + * type. */ + + /* Note that unit types have a say in what is runnable, + * too. For example, if they return -EAGAIN from + * unit_start() they can indicate they are not + * runnable yet. */ + + /* First check if there is an override */ + if (j->ignore_order) + return true; + + if (j->type == JOB_NOP) + return true; + + if (IN_SET(j->type, JOB_START, JOB_VERIFY_ACTIVE, JOB_RELOAD)) { + /* Immediate result is that the job is or might be + * started. In this case let's wait for the + * dependencies, regardless whether they are + * starting or stopping something. */ + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_AFTER], i) + if (other->job) + return false; + } + + /* Also, if something else is being stopped and we should + * change state after it, then let's wait. */ + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_BEFORE], i) + if (other->job && + IN_SET(other->job->type, JOB_STOP, JOB_RESTART)) + return false; + + /* This means that for a service a and a service b where b + * shall be started after a: + * + * start a + start b → 1st step start a, 2nd step start b + * start a + stop b → 1st step stop b, 2nd step start a + * stop a + start b → 1st step stop a, 2nd step start b + * stop a + stop b → 1st step stop b, 2nd step stop a + * + * This has the side effect that restarts are properly + * synchronized too. */ + + return true; +} + +static void job_change_type(Job *j, JobType newtype) { + assert(j); + + log_unit_debug(j->unit, + "Converting job %s/%s -> %s/%s", + j->unit->id, job_type_to_string(j->type), + j->unit->id, job_type_to_string(newtype)); + + j->type = newtype; +} + +_pure_ static const char* job_get_begin_status_message_format(Unit *u, JobType t) { + const char *format; + + assert(u); + + if (t == JOB_RELOAD) + return "Reloading %s."; + + assert(IN_SET(t, JOB_START, JOB_STOP)); + + format = UNIT_VTABLE(u)->status_message_formats.starting_stopping[t == JOB_STOP]; + if (format) + return format; + + /* Return generic strings */ + if (t == JOB_START) + return "Starting %s."; + else { + assert(t == JOB_STOP); + return "Stopping %s."; + } +} + +static void job_print_begin_status_message(Unit *u, JobType t) { + const char *format; + + assert(u); + + /* Reload status messages have traditionally not been printed to console. */ + if (!IN_SET(t, JOB_START, JOB_STOP)) + return; + + format = job_get_begin_status_message_format(u, t); + + DISABLE_WARNING_FORMAT_NONLITERAL; + unit_status_printf(u, "", format); + REENABLE_WARNING; +} + +static void job_log_begin_status_message(Unit *u, uint32_t job_id, JobType t) { + const char *format, *mid; + char buf[LINE_MAX]; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + if (!IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD)) + return; + + if (log_on_console()) /* Skip this if it would only go on the console anyway */ + return; + + /* We log status messages for all units and all operations. */ + + format = job_get_begin_status_message_format(u, t); + + DISABLE_WARNING_FORMAT_NONLITERAL; + (void) snprintf(buf, sizeof buf, format, unit_description(u)); + REENABLE_WARNING; + + mid = t == JOB_START ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTING_STR : + t == JOB_STOP ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPING_STR : + "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADING_STR; + + /* Note that we deliberately use LOG_MESSAGE() instead of + * LOG_UNIT_MESSAGE() here, since this is supposed to mimic + * closely what is written to screen using the status output, + * which is supposed the highest level, friendliest output + * possible, which means we should avoid the low-level unit + * name. */ + log_struct(LOG_INFO, + LOG_MESSAGE("%s", buf), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + mid); +} + +static void job_emit_begin_status_message(Unit *u, uint32_t job_id, JobType t) { + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + job_log_begin_status_message(u, job_id, t); + job_print_begin_status_message(u, t); +} + +static int job_perform_on_unit(Job **j) { + uint32_t id; + Manager *m; + JobType t; + Unit *u; + int r; + + /* While we execute this operation the job might go away (for + * example: because it finishes immediately or is replaced by + * a new, conflicting job.) To make sure we don't access a + * freed job later on we store the id here, so that we can + * verify the job is still valid. */ + + assert(j); + assert(*j); + + m = (*j)->manager; + u = (*j)->unit; + t = (*j)->type; + id = (*j)->id; + + switch (t) { + case JOB_START: + r = unit_start(u); + break; + + case JOB_RESTART: + t = JOB_STOP; + _fallthrough_; + case JOB_STOP: + r = unit_stop(u); + break; + + case JOB_RELOAD: + r = unit_reload(u); + break; + + default: + assert_not_reached("Invalid job type"); + } + + /* Log if the job still exists and the start/stop/reload function actually did something. Note that this means + * for units for which there's no 'activating' phase (i.e. because we transition directly from 'inactive' to + * 'active') we'll possibly skip the "Starting..." message. */ + *j = manager_get_job(m, id); + if (*j && r > 0) + job_emit_begin_status_message(u, id, t); + + return r; +} + +int job_run_and_invalidate(Job *j) { + int r; + + assert(j); + assert(j->installed); + assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(j->in_run_queue); + + LIST_REMOVE(run_queue, j->manager->run_queue, j); + j->in_run_queue = false; + + if (j->state != JOB_WAITING) + return 0; + + if (!job_is_runnable(j)) + return -EAGAIN; + + job_start_timer(j, true); + job_set_state(j, JOB_RUNNING); + job_add_to_dbus_queue(j); + + switch (j->type) { + + case JOB_VERIFY_ACTIVE: { + UnitActiveState t; + + t = unit_active_state(j->unit); + if (UNIT_IS_ACTIVE_OR_RELOADING(t)) + r = -EALREADY; + else if (t == UNIT_ACTIVATING) + r = -EAGAIN; + else + r = -EBADR; + break; + } + + case JOB_START: + case JOB_STOP: + case JOB_RESTART: + r = job_perform_on_unit(&j); + + /* If the unit type does not support starting/stopping, then simply wait. */ + if (r == -EBADR) + r = 0; + break; + + case JOB_RELOAD: + r = job_perform_on_unit(&j); + break; + + case JOB_NOP: + r = -EALREADY; + break; + + default: + assert_not_reached("Unknown job type"); + } + + if (j) { + if (r == -EAGAIN) + job_set_state(j, JOB_WAITING); /* Hmm, not ready after all, let's return to JOB_WAITING state */ + else if (r == -EALREADY) /* already being executed */ + r = job_finish_and_invalidate(j, JOB_DONE, true, true); + else if (r == -ECOMM) /* condition failed, but all is good */ + r = job_finish_and_invalidate(j, JOB_DONE, true, false); + else if (r == -EBADR) + r = job_finish_and_invalidate(j, JOB_SKIPPED, true, false); + else if (r == -ENOEXEC) + r = job_finish_and_invalidate(j, JOB_INVALID, true, false); + else if (r == -EPROTO) + r = job_finish_and_invalidate(j, JOB_ASSERT, true, false); + else if (r == -EOPNOTSUPP) + r = job_finish_and_invalidate(j, JOB_UNSUPPORTED, true, false); + else if (r == -ENOLINK) + r = job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false); + else if (r == -ESTALE) + r = job_finish_and_invalidate(j, JOB_ONCE, true, false); + else if (r < 0) + r = job_finish_and_invalidate(j, JOB_FAILED, true, false); + } + + return r; +} + +_pure_ static const char *job_get_done_status_message_format(Unit *u, JobType t, JobResult result) { + + static const char *const generic_finished_start_job[_JOB_RESULT_MAX] = { + [JOB_DONE] = "Started %s.", + [JOB_TIMEOUT] = "Timed out starting %s.", + [JOB_FAILED] = "Failed to start %s.", + [JOB_DEPENDENCY] = "Dependency failed for %s.", + [JOB_ASSERT] = "Assertion failed for %s.", + [JOB_UNSUPPORTED] = "Starting of %s not supported.", + [JOB_COLLECTED] = "Unnecessary job for %s was removed.", + [JOB_ONCE] = "Unit %s has been started before and cannot be started again." + }; + static const char *const generic_finished_stop_job[_JOB_RESULT_MAX] = { + [JOB_DONE] = "Stopped %s.", + [JOB_FAILED] = "Stopped (with error) %s.", + [JOB_TIMEOUT] = "Timed out stopping %s.", + }; + static const char *const generic_finished_reload_job[_JOB_RESULT_MAX] = { + [JOB_DONE] = "Reloaded %s.", + [JOB_FAILED] = "Reload failed for %s.", + [JOB_TIMEOUT] = "Timed out reloading %s.", + }; + /* When verify-active detects the unit is inactive, report it. + * Most likely a DEPEND warning from a requisiting unit will + * occur next and it's nice to see what was requisited. */ + static const char *const generic_finished_verify_active_job[_JOB_RESULT_MAX] = { + [JOB_SKIPPED] = "%s is not active.", + }; + + const char *format; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + if (IN_SET(t, JOB_START, JOB_STOP, JOB_RESTART)) { + format = t == JOB_START ? + UNIT_VTABLE(u)->status_message_formats.finished_start_job[result] : + UNIT_VTABLE(u)->status_message_formats.finished_stop_job[result]; + if (format) + return format; + } + + /* Return generic strings */ + if (t == JOB_START) + return generic_finished_start_job[result]; + else if (IN_SET(t, JOB_STOP, JOB_RESTART)) + return generic_finished_stop_job[result]; + else if (t == JOB_RELOAD) + return generic_finished_reload_job[result]; + else if (t == JOB_VERIFY_ACTIVE) + return generic_finished_verify_active_job[result]; + + return NULL; +} + +static const struct { + const char *color, *word; +} job_print_done_status_messages[_JOB_RESULT_MAX] = { + [JOB_DONE] = { ANSI_OK_COLOR, " OK " }, + [JOB_TIMEOUT] = { ANSI_HIGHLIGHT_RED, " TIME " }, + [JOB_FAILED] = { ANSI_HIGHLIGHT_RED, "FAILED" }, + [JOB_DEPENDENCY] = { ANSI_HIGHLIGHT_YELLOW, "DEPEND" }, + [JOB_SKIPPED] = { ANSI_HIGHLIGHT, " INFO " }, + [JOB_ASSERT] = { ANSI_HIGHLIGHT_YELLOW, "ASSERT" }, + [JOB_UNSUPPORTED] = { ANSI_HIGHLIGHT_YELLOW, "UNSUPP" }, + /* JOB_COLLECTED */ + [JOB_ONCE] = { ANSI_HIGHLIGHT_RED, " ONCE " }, +}; + +static void job_print_done_status_message(Unit *u, JobType t, JobResult result) { + const char *format; + const char *status; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + /* Reload status messages have traditionally not been printed to console. */ + if (t == JOB_RELOAD) + return; + + /* No message if the job did not actually do anything due to failed condition. */ + if (t == JOB_START && result == JOB_DONE && !u->condition_result) + return; + + if (!job_print_done_status_messages[result].word) + return; + + format = job_get_done_status_message_format(u, t, result); + if (!format) + return; + + if (log_get_show_color()) + status = strjoina(job_print_done_status_messages[result].color, + job_print_done_status_messages[result].word, + ANSI_NORMAL); + else + status = job_print_done_status_messages[result].word; + + if (result != JOB_DONE) + manager_flip_auto_status(u->manager, true); + + DISABLE_WARNING_FORMAT_NONLITERAL; + unit_status_printf(u, status, format); + REENABLE_WARNING; + + if (t == JOB_START && result == JOB_FAILED) { + _cleanup_free_ char *quoted; + + quoted = shell_maybe_quote(u->id, ESCAPE_BACKSLASH); + manager_status_printf(u->manager, STATUS_TYPE_NORMAL, NULL, "See 'systemctl status %s' for details.", strna(quoted)); + } +} + +static void job_log_done_status_message(Unit *u, uint32_t job_id, JobType t, JobResult result) { + const char *format, *mid; + char buf[LINE_MAX]; + static const int job_result_log_level[_JOB_RESULT_MAX] = { + [JOB_DONE] = LOG_INFO, + [JOB_CANCELED] = LOG_INFO, + [JOB_TIMEOUT] = LOG_ERR, + [JOB_FAILED] = LOG_ERR, + [JOB_DEPENDENCY] = LOG_WARNING, + [JOB_SKIPPED] = LOG_NOTICE, + [JOB_INVALID] = LOG_INFO, + [JOB_ASSERT] = LOG_WARNING, + [JOB_UNSUPPORTED] = LOG_WARNING, + [JOB_COLLECTED] = LOG_INFO, + [JOB_ONCE] = LOG_ERR, + }; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + /* Skip printing if output goes to the console, and job_print_status_message() + will actually print something to the console. */ + if (log_on_console() && job_print_done_status_messages[result].word) + return; + + /* Show condition check message if the job did not actually do anything due to failed condition. */ + if (t == JOB_START && result == JOB_DONE && !u->condition_result) { + log_struct(LOG_INFO, + "MESSAGE=Condition check resulted in %s being skipped.", unit_description(u), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTED_STR); + + return; + } + + format = job_get_done_status_message_format(u, t, result); + if (!format) + return; + + /* The description might be longer than the buffer, but that's OK, + * we'll just truncate it here. Note that we use snprintf() rather than + * xsprintf() on purpose here: we are fine with truncation and don't + * consider that an error. */ + DISABLE_WARNING_FORMAT_NONLITERAL; + (void) snprintf(buf, sizeof(buf), format, unit_description(u)); + REENABLE_WARNING; + + switch (t) { + + case JOB_START: + if (result == JOB_DONE) + mid = "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTED_STR; + else + mid = "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILED_STR; + break; + + case JOB_RELOAD: + mid = "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADED_STR; + break; + + case JOB_STOP: + case JOB_RESTART: + mid = "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPED_STR; + break; + + default: + log_struct(job_result_log_level[result], + LOG_MESSAGE("%s", buf), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u)); + return; + } + + log_struct(job_result_log_level[result], + LOG_MESSAGE("%s", buf), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + mid); +} + +static void job_emit_done_status_message(Unit *u, uint32_t job_id, JobType t, JobResult result) { + assert(u); + + job_log_done_status_message(u, job_id, t, result); + job_print_done_status_message(u, t, result); +} + +static void job_fail_dependencies(Unit *u, UnitDependency d) { + Unit *other; + Iterator i; + void *v; + + assert(u); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[d], i) { + Job *j = other->job; + + if (!j) + continue; + if (!IN_SET(j->type, JOB_START, JOB_VERIFY_ACTIVE)) + continue; + + job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false); + } +} + +int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) { + Unit *u; + Unit *other; + JobType t; + Iterator i; + void *v; + + assert(j); + assert(j->installed); + assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION); + + u = j->unit; + t = j->type; + + j->result = result; + + log_unit_debug(u, "Job %" PRIu32 " %s/%s finished, result=%s", j->id, u->id, job_type_to_string(t), job_result_to_string(result)); + + /* If this job did nothing to respective unit we don't log the status message */ + if (!already) + job_emit_done_status_message(u, j->id, t, result); + + /* Patch restart jobs so that they become normal start jobs */ + if (result == JOB_DONE && t == JOB_RESTART) { + + job_change_type(j, JOB_START); + job_set_state(j, JOB_WAITING); + + job_add_to_dbus_queue(j); + job_add_to_run_queue(j); + job_add_to_gc_queue(j); + + goto finish; + } + + if (IN_SET(result, JOB_FAILED, JOB_INVALID)) + j->manager->n_failed_jobs++; + + job_uninstall(j); + job_free(j); + + /* Fail depending jobs on failure */ + if (result != JOB_DONE && recursive) { + if (IN_SET(t, JOB_START, JOB_VERIFY_ACTIVE)) { + job_fail_dependencies(u, UNIT_REQUIRED_BY); + job_fail_dependencies(u, UNIT_REQUISITE_OF); + job_fail_dependencies(u, UNIT_BOUND_BY); + } else if (t == JOB_STOP) + job_fail_dependencies(u, UNIT_CONFLICTED_BY); + } + + /* Trigger OnFailure dependencies that are not generated by + * the unit itself. We don't treat JOB_CANCELED as failure in + * this context. And JOB_FAILURE is already handled by the + * unit itself. */ + if (IN_SET(result, JOB_TIMEOUT, JOB_DEPENDENCY)) { + log_struct(LOG_NOTICE, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_ID(u), + LOG_UNIT_MESSAGE(u, "Job %s/%s failed with result '%s'.", + u->id, + job_type_to_string(t), + job_result_to_string(result))); + + unit_start_on_failure(u); + } + + unit_trigger_notify(u); + +finish: + /* Try to start the next jobs that can be started */ + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_AFTER], i) + if (other->job) { + job_add_to_run_queue(other->job); + job_add_to_gc_queue(other->job); + } + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BEFORE], i) + if (other->job) { + job_add_to_run_queue(other->job); + job_add_to_gc_queue(other->job); + } + + manager_check_finished(u->manager); + + return 0; +} + +static int job_dispatch_timer(sd_event_source *s, uint64_t monotonic, void *userdata) { + Job *j = userdata; + Unit *u; + + assert(j); + assert(s == j->timer_event_source); + + log_unit_warning(j->unit, "Job %s/%s timed out.", j->unit->id, job_type_to_string(j->type)); + + u = j->unit; + job_finish_and_invalidate(j, JOB_TIMEOUT, true, false); + + emergency_action(u->manager, u->job_timeout_action, + EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN, + u->job_timeout_reboot_arg, -1, "job timed out"); + + return 0; +} + +int job_start_timer(Job *j, bool job_running) { + int r; + usec_t timeout_time, old_timeout_time; + + if (job_running) { + j->begin_running_usec = now(CLOCK_MONOTONIC); + + if (j->unit->job_running_timeout == USEC_INFINITY) + return 0; + + timeout_time = usec_add(j->begin_running_usec, j->unit->job_running_timeout); + + if (j->timer_event_source) { + /* Update only if JobRunningTimeoutSec= results in earlier timeout */ + r = sd_event_source_get_time(j->timer_event_source, &old_timeout_time); + if (r < 0) + return r; + + if (old_timeout_time <= timeout_time) + return 0; + + return sd_event_source_set_time(j->timer_event_source, timeout_time); + } + } else { + if (j->timer_event_source) + return 0; + + j->begin_usec = now(CLOCK_MONOTONIC); + + if (j->unit->job_timeout == USEC_INFINITY) + return 0; + + timeout_time = usec_add(j->begin_usec, j->unit->job_timeout); + } + + r = sd_event_add_time( + j->manager->event, + &j->timer_event_source, + CLOCK_MONOTONIC, + timeout_time, 0, + job_dispatch_timer, j); + if (r < 0) + return r; + + (void) sd_event_source_set_description(j->timer_event_source, "job-start"); + + return 0; +} + +void job_add_to_run_queue(Job *j) { + int r; + + assert(j); + assert(j->installed); + + if (j->in_run_queue) + return; + + if (!j->manager->run_queue) { + r = sd_event_source_set_enabled(j->manager->run_queue_event_source, SD_EVENT_ONESHOT); + if (r < 0) + log_warning_errno(r, "Failed to enable job run queue event source, ignoring: %m"); + } + + LIST_PREPEND(run_queue, j->manager->run_queue, j); + j->in_run_queue = true; +} + +void job_add_to_dbus_queue(Job *j) { + assert(j); + assert(j->installed); + + if (j->in_dbus_queue) + return; + + /* We don't check if anybody is subscribed here, since this + * job might just have been created and not yet assigned to a + * connection/client. */ + + LIST_PREPEND(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = true; +} + +char *job_dbus_path(Job *j) { + char *p; + + assert(j); + + if (asprintf(&p, "/org/freedesktop/systemd1/job/%"PRIu32, j->id) < 0) + return NULL; + + return p; +} + +int job_serialize(Job *j, FILE *f) { + assert(j); + assert(f); + + (void) serialize_item_format(f, "job-id", "%u", j->id); + (void) serialize_item(f, "job-type", job_type_to_string(j->type)); + (void) serialize_item(f, "job-state", job_state_to_string(j->state)); + (void) serialize_bool(f, "job-irreversible", j->irreversible); + (void) serialize_bool(f, "job-sent-dbus-new-signal", j->sent_dbus_new_signal); + (void) serialize_bool(f, "job-ignore-order", j->ignore_order); + + if (j->begin_usec > 0) + (void) serialize_usec(f, "job-begin", j->begin_usec); + if (j->begin_running_usec > 0) + (void) serialize_usec(f, "job-begin-running", j->begin_running_usec); + + bus_track_serialize(j->bus_track, f, "subscribed"); + + /* End marker */ + fputc('\n', f); + return 0; +} + +int job_deserialize(Job *j, FILE *f) { + int r; + + assert(j); + assert(f); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *l, *v; + size_t k; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + return 0; + + l = strstrip(line); + + /* End marker */ + if (isempty(l)) + return 0; + + k = strcspn(l, "="); + + if (l[k] == '=') { + l[k] = 0; + v = l+k+1; + } else + v = l+k; + + if (streq(l, "job-id")) { + + if (safe_atou32(v, &j->id) < 0) + log_debug("Failed to parse job id value: %s", v); + + } else if (streq(l, "job-type")) { + JobType t; + + t = job_type_from_string(v); + if (t < 0) + log_debug("Failed to parse job type: %s", v); + else if (t >= _JOB_TYPE_MAX_IN_TRANSACTION) + log_debug("Cannot deserialize job of type: %s", v); + else + j->type = t; + + } else if (streq(l, "job-state")) { + JobState s; + + s = job_state_from_string(v); + if (s < 0) + log_debug("Failed to parse job state: %s", v); + else + job_set_state(j, s); + + } else if (streq(l, "job-irreversible")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_debug("Failed to parse job irreversible flag: %s", v); + else + j->irreversible = j->irreversible || b; + + } else if (streq(l, "job-sent-dbus-new-signal")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_debug("Failed to parse job sent_dbus_new_signal flag: %s", v); + else + j->sent_dbus_new_signal = j->sent_dbus_new_signal || b; + + } else if (streq(l, "job-ignore-order")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_debug("Failed to parse job ignore_order flag: %s", v); + else + j->ignore_order = j->ignore_order || b; + + } else if (streq(l, "job-begin")) + (void) deserialize_usec(v, &j->begin_usec); + + else if (streq(l, "job-begin-running")) + (void) deserialize_usec(v, &j->begin_running_usec); + + else if (streq(l, "subscribed")) { + if (strv_extend(&j->deserialized_clients, v) < 0) + return log_oom(); + } else + log_debug("Unknown job serialization key: %s", l); + } +} + +int job_coldplug(Job *j) { + int r; + usec_t timeout_time = USEC_INFINITY; + + assert(j); + + /* After deserialization is complete and the bus connection + * set up again, let's start watching our subscribers again */ + (void) bus_job_coldplug_bus_track(j); + + if (j->state == JOB_WAITING) + job_add_to_run_queue(j); + + /* Maybe due to new dependencies we don't actually need this job anymore? */ + job_add_to_gc_queue(j); + + /* Create timer only when job began or began running and the respective timeout is finite. + * Follow logic of job_start_timer() if both timeouts are finite */ + if (j->begin_usec == 0) + return 0; + + if (j->unit->job_timeout != USEC_INFINITY) + timeout_time = usec_add(j->begin_usec, j->unit->job_timeout); + + if (j->begin_running_usec > 0 && j->unit->job_running_timeout != USEC_INFINITY) + timeout_time = MIN(timeout_time, usec_add(j->begin_running_usec, j->unit->job_running_timeout)); + + if (timeout_time == USEC_INFINITY) + return 0; + + j->timer_event_source = sd_event_source_unref(j->timer_event_source); + + r = sd_event_add_time( + j->manager->event, + &j->timer_event_source, + CLOCK_MONOTONIC, + timeout_time, 0, + job_dispatch_timer, j); + if (r < 0) + log_debug_errno(r, "Failed to restart timeout for job: %m"); + + (void) sd_event_source_set_description(j->timer_event_source, "job-timeout"); + + return r; +} + +void job_shutdown_magic(Job *j) { + assert(j); + + /* The shutdown target gets some special treatment here: we + * tell the kernel to begin with flushing its disk caches, to + * optimize shutdown time a bit. Ideally we wouldn't hardcode + * this magic into PID 1. However all other processes aren't + * options either since they'd exit much sooner than PID 1 and + * asynchronous sync() would cause their exit to be + * delayed. */ + + if (j->type != JOB_START) + return; + + if (!MANAGER_IS_SYSTEM(j->unit->manager)) + return; + + if (!unit_has_name(j->unit, SPECIAL_SHUTDOWN_TARGET)) + return; + + /* In case messages on console has been disabled on boot */ + j->unit->manager->no_console_output = false; + + if (detect_container() > 0) + return; + + (void) asynchronous_sync(NULL); +} + +int job_get_timeout(Job *j, usec_t *timeout) { + usec_t x = USEC_INFINITY, y = USEC_INFINITY; + Unit *u = j->unit; + int r; + + assert(u); + + if (j->timer_event_source) { + r = sd_event_source_get_time(j->timer_event_source, &x); + if (r < 0) + return r; + } + + if (UNIT_VTABLE(u)->get_timeout) { + r = UNIT_VTABLE(u)->get_timeout(u, &y); + if (r < 0) + return r; + } + + if (x == USEC_INFINITY && y == USEC_INFINITY) + return 0; + + *timeout = MIN(x, y); + return 1; +} + +bool job_may_gc(Job *j) { + Unit *other; + Iterator i; + void *v; + + assert(j); + + /* Checks whether this job should be GC'ed away. We only do this for jobs of units that have no effect on their + * own and just track external state. For now the only unit type that qualifies for this are .device units. + * Returns true if the job can be collected. */ + + if (!UNIT_VTABLE(j->unit)->gc_jobs) + return false; + + if (sd_bus_track_count(j->bus_track) > 0) + return false; + + /* FIXME: So this is a bit ugly: for now we don't properly track references made via private bus connections + * (because it's nasty, as sd_bus_track doesn't apply to it). We simply remember that the job was once + * referenced by one, and reset this whenever we notice that no private bus connections are around. This means + * the GC is a bit too conservative when it comes to jobs created by private bus connections. */ + if (j->ref_by_private_bus) { + if (set_isempty(j->unit->manager->private_buses)) + j->ref_by_private_bus = false; + else + return false; + } + + if (j->type == JOB_NOP) + return false; + + /* If a job is ordered after ours, and is to be started, then it needs to wait for us, regardless if we stop or + * start, hence let's not GC in that case. */ + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_BEFORE], i) { + if (!other->job) + continue; + + if (other->job->ignore_order) + continue; + + if (IN_SET(other->job->type, JOB_START, JOB_VERIFY_ACTIVE, JOB_RELOAD)) + return false; + } + + /* If we are going down, but something else is ordered After= us, then it needs to wait for us */ + if (IN_SET(j->type, JOB_STOP, JOB_RESTART)) + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_AFTER], i) { + if (!other->job) + continue; + + if (other->job->ignore_order) + continue; + + return false; + } + + /* The logic above is kinda the inverse of the job_is_runnable() logic. Specifically, if the job "we" is + * ordered before the job "other": + * + * we start + other start → stay + * we start + other stop → gc + * we stop + other start → stay + * we stop + other stop → gc + * + * "we" are ordered after "other": + * + * we start + other start → gc + * we start + other stop → gc + * we stop + other start → stay + * we stop + other stop → stay + */ + + return true; +} + +void job_add_to_gc_queue(Job *j) { + assert(j); + + if (j->in_gc_queue) + return; + + if (!job_may_gc(j)) + return; + + LIST_PREPEND(gc_queue, j->unit->manager->gc_job_queue, j); + j->in_gc_queue = true; +} + +static int job_compare(Job * const *a, Job * const *b) { + return CMP((*a)->id, (*b)->id); +} + +static size_t sort_job_list(Job **list, size_t n) { + Job *previous = NULL; + size_t a, b; + + /* Order by numeric IDs */ + typesafe_qsort(list, n, job_compare); + + /* Filter out duplicates */ + for (a = 0, b = 0; a < n; a++) { + + if (previous == list[a]) + continue; + + previous = list[b++] = list[a]; + } + + return b; +} + +int job_get_before(Job *j, Job*** ret) { + _cleanup_free_ Job** list = NULL; + size_t n = 0, n_allocated = 0; + Unit *other = NULL; + Iterator i; + void *v; + + /* Returns a list of all pending jobs that need to finish before this job may be started. */ + + assert(j); + assert(ret); + + if (j->ignore_order) { + *ret = NULL; + return 0; + } + + if (IN_SET(j->type, JOB_START, JOB_VERIFY_ACTIVE, JOB_RELOAD)) { + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_AFTER], i) { + if (!other->job) + continue; + + if (!GREEDY_REALLOC(list, n_allocated, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + } + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_BEFORE], i) { + if (!other->job) + continue; + + if (!IN_SET(other->job->type, JOB_STOP, JOB_RESTART)) + continue; + + if (!GREEDY_REALLOC(list, n_allocated, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + n = sort_job_list(list, n); + + *ret = TAKE_PTR(list); + + return (int) n; +} + +int job_get_after(Job *j, Job*** ret) { + _cleanup_free_ Job** list = NULL; + size_t n = 0, n_allocated = 0; + Unit *other = NULL; + void *v; + Iterator i; + + assert(j); + assert(ret); + + /* Returns a list of all pending jobs that are waiting for this job to finish. */ + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_BEFORE], i) { + if (!other->job) + continue; + + if (other->job->ignore_order) + continue; + + if (!IN_SET(other->job->type, JOB_START, JOB_VERIFY_ACTIVE, JOB_RELOAD)) + continue; + + if (!GREEDY_REALLOC(list, n_allocated, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + if (IN_SET(j->type, JOB_STOP, JOB_RESTART)) { + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_AFTER], i) { + if (!other->job) + continue; + + if (other->job->ignore_order) + continue; + + if (!GREEDY_REALLOC(list, n_allocated, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + } + + n = sort_job_list(list, n); + + *ret = TAKE_PTR(list); + + return (int) n; +} + +static const char* const job_state_table[_JOB_STATE_MAX] = { + [JOB_WAITING] = "waiting", + [JOB_RUNNING] = "running", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_state, JobState); + +static const char* const job_type_table[_JOB_TYPE_MAX] = { + [JOB_START] = "start", + [JOB_VERIFY_ACTIVE] = "verify-active", + [JOB_STOP] = "stop", + [JOB_RELOAD] = "reload", + [JOB_RELOAD_OR_START] = "reload-or-start", + [JOB_RESTART] = "restart", + [JOB_TRY_RESTART] = "try-restart", + [JOB_TRY_RELOAD] = "try-reload", + [JOB_NOP] = "nop", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_type, JobType); + +static const char* const job_mode_table[_JOB_MODE_MAX] = { + [JOB_FAIL] = "fail", + [JOB_REPLACE] = "replace", + [JOB_REPLACE_IRREVERSIBLY] = "replace-irreversibly", + [JOB_ISOLATE] = "isolate", + [JOB_FLUSH] = "flush", + [JOB_IGNORE_DEPENDENCIES] = "ignore-dependencies", + [JOB_IGNORE_REQUIREMENTS] = "ignore-requirements", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_mode, JobMode); + +static const char* const job_result_table[_JOB_RESULT_MAX] = { + [JOB_DONE] = "done", + [JOB_CANCELED] = "canceled", + [JOB_TIMEOUT] = "timeout", + [JOB_FAILED] = "failed", + [JOB_DEPENDENCY] = "dependency", + [JOB_SKIPPED] = "skipped", + [JOB_INVALID] = "invalid", + [JOB_ASSERT] = "assert", + [JOB_UNSUPPORTED] = "unsupported", + [JOB_COLLECTED] = "collected", + [JOB_ONCE] = "once", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_result, JobResult); + +const char* job_type_to_access_method(JobType t) { + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + if (IN_SET(t, JOB_START, JOB_RESTART, JOB_TRY_RESTART)) + return "start"; + else if (t == JOB_STOP) + return "stop"; + else + return "reload"; +} diff --git a/src/core/job.h b/src/core/job.h new file mode 100644 index 0000000..1b9bcdd --- /dev/null +++ b/src/core/job.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <stdbool.h> + +#include "sd-event.h" + +#include "list.h" +#include "unit-name.h" + +typedef struct Job Job; +typedef struct JobDependency JobDependency; +typedef enum JobType JobType; +typedef enum JobState JobState; +typedef enum JobMode JobMode; +typedef enum JobResult JobResult; + +/* Be careful when changing the job types! Adjust job_merging_table[] accordingly! */ +enum JobType { + JOB_START, /* if a unit does not support being started, we'll just wait until it becomes active */ + JOB_VERIFY_ACTIVE, + + JOB_STOP, + + JOB_RELOAD, /* if running, reload */ + + /* Note that restarts are first treated like JOB_STOP, but + * then instead of finishing are patched to become + * JOB_START. */ + JOB_RESTART, /* If running, stop. Then start unconditionally. */ + + _JOB_TYPE_MAX_MERGING, + + /* JOB_NOP can enter into a transaction, but as it won't pull in + * any dependencies and it uses the special 'nop_job' slot in Unit, + * it won't have to merge with anything (except possibly into another + * JOB_NOP, previously installed). JOB_NOP is special-cased in + * job_type_is_*() functions so that the transaction can be + * activated. */ + JOB_NOP = _JOB_TYPE_MAX_MERGING, /* do nothing */ + + _JOB_TYPE_MAX_IN_TRANSACTION, + + /* JOB_TRY_RESTART can never appear in a transaction, because + * it always collapses into JOB_RESTART or JOB_NOP before entering. + * Thus we never need to merge it with anything. */ + JOB_TRY_RESTART = _JOB_TYPE_MAX_IN_TRANSACTION, /* if running, stop and then start */ + + /* Similar to JOB_TRY_RESTART but collapses to JOB_RELOAD or JOB_NOP */ + JOB_TRY_RELOAD, + + /* JOB_RELOAD_OR_START won't enter into a transaction and cannot result + * from transaction merging (there's no way for JOB_RELOAD and + * JOB_START to meet in one transaction). It can result from a merge + * during job installation, but then it will immediately collapse into + * one of the two simpler types. */ + JOB_RELOAD_OR_START, /* if running, reload, otherwise start */ + + _JOB_TYPE_MAX, + _JOB_TYPE_INVALID = -1 +}; + +enum JobState { + JOB_WAITING, + JOB_RUNNING, + _JOB_STATE_MAX, + _JOB_STATE_INVALID = -1 +}; + +enum JobMode { + JOB_FAIL, /* Fail if a conflicting job is already queued */ + JOB_REPLACE, /* Replace an existing conflicting job */ + JOB_REPLACE_IRREVERSIBLY,/* Like JOB_REPLACE + produce irreversible jobs */ + JOB_ISOLATE, /* Start a unit, and stop all others */ + JOB_FLUSH, /* Flush out all other queued jobs when queing this one */ + JOB_IGNORE_DEPENDENCIES, /* Ignore both requirement and ordering dependencies */ + JOB_IGNORE_REQUIREMENTS, /* Ignore requirement dependencies */ + _JOB_MODE_MAX, + _JOB_MODE_INVALID = -1 +}; + +enum JobResult { + JOB_DONE, /* Job completed successfully (or skipped due to a failed ConditionXYZ=) */ + JOB_CANCELED, /* Job canceled by a conflicting job installation or by explicit cancel request */ + JOB_TIMEOUT, /* Job timeout elapsed */ + JOB_FAILED, /* Job failed */ + JOB_DEPENDENCY, /* A required dependency job did not result in JOB_DONE */ + JOB_SKIPPED, /* Negative result of JOB_VERIFY_ACTIVE */ + JOB_INVALID, /* JOB_RELOAD of inactive unit */ + JOB_ASSERT, /* Couldn't start a unit, because an assert didn't hold */ + JOB_UNSUPPORTED, /* Couldn't start a unit, because the unit type is not supported on the system */ + JOB_COLLECTED, /* Job was garbage collected, since nothing needed it anymore */ + JOB_ONCE, /* Unit was started before, and hence can't be started again */ + _JOB_RESULT_MAX, + _JOB_RESULT_INVALID = -1 +}; + +#include "unit.h" + +struct JobDependency { + /* Encodes that the 'subject' job needs the 'object' job in + * some way. This structure is used only while building a transaction. */ + Job *subject; + Job *object; + + LIST_FIELDS(JobDependency, subject); + LIST_FIELDS(JobDependency, object); + + bool matters:1; + bool conflicts:1; +}; + +struct Job { + Manager *manager; + Unit *unit; + + LIST_FIELDS(Job, transaction); + LIST_FIELDS(Job, run_queue); + LIST_FIELDS(Job, dbus_queue); + LIST_FIELDS(Job, gc_queue); + + LIST_HEAD(JobDependency, subject_list); + LIST_HEAD(JobDependency, object_list); + + /* Used for graph algs as a "I have been here" marker */ + Job* marker; + unsigned generation; + + uint32_t id; + + JobType type; + JobState state; + + sd_event_source *timer_event_source; + usec_t begin_usec; + usec_t begin_running_usec; + + /* + * This tracks where to send signals, and also which clients + * are allowed to call DBus methods on the job (other than + * root). + * + * There can be more than one client, because of job merging. + */ + sd_bus_track *bus_track; + char **deserialized_clients; + + JobResult result; + + bool installed:1; + bool in_run_queue:1; + bool matters_to_anchor:1; + bool in_dbus_queue:1; + bool sent_dbus_new_signal:1; + bool ignore_order:1; + bool irreversible:1; + bool in_gc_queue:1; + bool ref_by_private_bus:1; +}; + +Job* job_new(Unit *unit, JobType type); +Job* job_new_raw(Unit *unit); +void job_unlink(Job *job); +Job* job_free(Job *job); +Job* job_install(Job *j); +int job_install_deserialized(Job *j); +void job_uninstall(Job *j); +void job_dump(Job *j, FILE *f, const char *prefix); +int job_serialize(Job *j, FILE *f); +int job_deserialize(Job *j, FILE *f); +int job_coldplug(Job *j); + +JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts); +void job_dependency_free(JobDependency *l); + +int job_merge(Job *j, Job *other); + +JobType job_type_lookup_merge(JobType a, JobType b) _pure_; + +_pure_ static inline bool job_type_is_mergeable(JobType a, JobType b) { + return job_type_lookup_merge(a, b) >= 0; +} + +_pure_ static inline bool job_type_is_conflicting(JobType a, JobType b) { + return a != JOB_NOP && b != JOB_NOP && !job_type_is_mergeable(a, b); +} + +_pure_ static inline bool job_type_is_superset(JobType a, JobType b) { + /* Checks whether operation a is a "superset" of b in its actions */ + if (b == JOB_NOP) + return true; + if (a == JOB_NOP) + return false; + return a == job_type_lookup_merge(a, b); +} + +bool job_type_is_redundant(JobType a, UnitActiveState b) _pure_; + +/* Collapses a state-dependent job type into a simpler type by observing + * the state of the unit which it is going to be applied to. */ +JobType job_type_collapse(JobType t, Unit *u); + +int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u); + +void job_add_to_run_queue(Job *j); +void job_add_to_dbus_queue(Job *j); + +int job_start_timer(Job *j, bool job_running); + +int job_run_and_invalidate(Job *j); +int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already); + +char *job_dbus_path(Job *j); + +void job_shutdown_magic(Job *j); + +int job_get_timeout(Job *j, usec_t *timeout) _pure_; + +bool job_may_gc(Job *j); +void job_add_to_gc_queue(Job *j); + +int job_get_before(Job *j, Job*** ret); +int job_get_after(Job *j, Job*** ret); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Job*, job_free); + +const char* job_type_to_string(JobType t) _const_; +JobType job_type_from_string(const char *s) _pure_; + +const char* job_state_to_string(JobState t) _const_; +JobState job_state_from_string(const char *s) _pure_; + +const char* job_mode_to_string(JobMode t) _const_; +JobMode job_mode_from_string(const char *s) _pure_; + +const char* job_result_to_string(JobResult t) _const_; +JobResult job_result_from_string(const char *s) _pure_; + +const char* job_type_to_access_method(JobType t); diff --git a/src/core/kill.c b/src/core/kill.c new file mode 100644 index 0000000..6fe96cf --- /dev/null +++ b/src/core/kill.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "kill.h" +#include "signal-util.h" +#include "string-table.h" +#include "util.h" + +void kill_context_init(KillContext *c) { + assert(c); + + c->kill_signal = SIGTERM; + c->final_kill_signal = SIGKILL; + c->send_sigkill = true; + c->send_sighup = false; + c->watchdog_signal = SIGABRT; +} + +void kill_context_dump(KillContext *c, FILE *f, const char *prefix) { + assert(c); + + prefix = strempty(prefix); + + fprintf(f, + "%sKillMode: %s\n" + "%sKillSignal: SIG%s\n" + "%sFinalKillSignal: SIG%s\n" + "%sSendSIGKILL: %s\n" + "%sSendSIGHUP: %s\n", + prefix, kill_mode_to_string(c->kill_mode), + prefix, signal_to_string(c->kill_signal), + prefix, signal_to_string(c->final_kill_signal), + prefix, yes_no(c->send_sigkill), + prefix, yes_no(c->send_sighup)); +} + +static const char* const kill_mode_table[_KILL_MODE_MAX] = { + [KILL_CONTROL_GROUP] = "control-group", + [KILL_PROCESS] = "process", + [KILL_MIXED] = "mixed", + [KILL_NONE] = "none" +}; + +DEFINE_STRING_TABLE_LOOKUP(kill_mode, KillMode); + +static const char* const kill_who_table[_KILL_WHO_MAX] = { + [KILL_MAIN] = "main", + [KILL_CONTROL] = "control", + [KILL_ALL] = "all", + [KILL_MAIN_FAIL] = "main-fail", + [KILL_CONTROL_FAIL] = "control-fail", + [KILL_ALL_FAIL] = "all-fail" +}; + +DEFINE_STRING_TABLE_LOOKUP(kill_who, KillWho); diff --git a/src/core/kill.h b/src/core/kill.h new file mode 100644 index 0000000..f3915be --- /dev/null +++ b/src/core/kill.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct KillContext KillContext; + +#include <stdbool.h> +#include <stdio.h> + +#include "macro.h" + +typedef enum KillMode { + /* The kill mode is a property of a unit. */ + KILL_CONTROL_GROUP = 0, + KILL_PROCESS, + KILL_MIXED, + KILL_NONE, + _KILL_MODE_MAX, + _KILL_MODE_INVALID = -1 +} KillMode; + +struct KillContext { + KillMode kill_mode; + int kill_signal; + int final_kill_signal; + bool send_sigkill; + bool send_sighup; + int watchdog_signal; +}; + +typedef enum KillWho { + /* Kill who is a property of an operation */ + KILL_MAIN, + KILL_CONTROL, + KILL_ALL, + KILL_MAIN_FAIL, + KILL_CONTROL_FAIL, + KILL_ALL_FAIL, + _KILL_WHO_MAX, + _KILL_WHO_INVALID = -1 +} KillWho; + +void kill_context_init(KillContext *c); +void kill_context_dump(KillContext *c, FILE *f, const char *prefix); + +const char *kill_mode_to_string(KillMode k) _const_; +KillMode kill_mode_from_string(const char *s) _pure_; + +const char *kill_who_to_string(KillWho k) _const_; +KillWho kill_who_from_string(const char *s) _pure_; diff --git a/src/core/killall.c b/src/core/killall.c new file mode 100644 index 0000000..f0ce996 --- /dev/null +++ b/src/core/killall.c @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include <errno.h> +#include <signal.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "def.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "killall.h" +#include "parse-util.h" +#include "process-util.h" +#include "set.h" +#include "string-util.h" +#include "terminal-util.h" +#include "util.h" + +static bool ignore_proc(pid_t pid, bool warn_rootfs) { + _cleanup_fclose_ FILE *f = NULL; + const char *p; + char c = 0; + uid_t uid; + int r; + + /* We are PID 1, let's not commit suicide */ + if (pid <= 1) + return true; + + /* Ignore kernel threads */ + r = is_kernel_thread(pid); + if (r != 0) + return true; /* also ignore processes where we can't determine this */ + + r = get_process_uid(pid, &uid); + if (r < 0) + return true; /* not really, but better safe than sorry */ + + /* Non-root processes otherwise are always subject to be killed */ + if (uid != 0) + return false; + + p = procfs_file_alloca(pid, "cmdline"); + f = fopen(p, "re"); + if (!f) + return true; /* not really, but has the desired effect */ + + /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for + * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as + * actual kernel threads are already filtered out above. */ + (void) fread(&c, 1, 1, f); + + /* Processes with argv[0][0] = '@' we ignore from the killing spree. + * + * http://www.freedesktop.org/wiki/Software/systemd/RootStorageDaemons */ + if (c != '@') + return false; + + if (warn_rootfs && + pid_from_same_root_fs(pid) == 0) { + + _cleanup_free_ char *comm = NULL; + + (void) get_process_comm(pid, &comm); + + log_notice("Process " PID_FMT " (%s) has been marked to be excluded from killing. It is " + "running from the root file system, and thus likely to block re-mounting of the " + "root file system to read-only. Please consider moving it into an initrd file " + "system instead.", pid, strna(comm)); + } + + return true; +} + +static void wait_for_children(Set *pids, sigset_t *mask, usec_t timeout) { + usec_t until; + + assert(mask); + + if (set_isempty(pids)) + return; + + until = now(CLOCK_MONOTONIC) + timeout; + for (;;) { + struct timespec ts; + int k; + usec_t n; + void *p; + Iterator i; + + /* First, let the kernel inform us about killed + * children. Most processes will probably be our + * children, but some are not (might be our + * grandchildren instead...). */ + for (;;) { + pid_t pid; + + pid = waitpid(-1, NULL, WNOHANG); + if (pid == 0) + break; + if (pid < 0) { + if (errno == ECHILD) + break; + + log_error_errno(errno, "waitpid() failed: %m"); + return; + } + + (void) set_remove(pids, PID_TO_PTR(pid)); + } + + /* Now explicitly check who might be remaining, who + * might not be our child. */ + SET_FOREACH(p, pids, i) { + + /* kill(pid, 0) sends no signal, but it tells + * us whether the process still exists. */ + if (kill(PTR_TO_PID(p), 0) == 0) + continue; + + if (errno != ESRCH) + continue; + + set_remove(pids, p); + } + + if (set_isempty(pids)) + return; + + n = now(CLOCK_MONOTONIC); + if (n >= until) + return; + + timespec_store(&ts, until - n); + k = sigtimedwait(mask, NULL, &ts); + if (k != SIGCHLD) { + + if (k < 0 && errno != EAGAIN) { + log_error_errno(errno, "sigtimedwait() failed: %m"); + return; + } + + if (k >= 0) + log_warning("sigtimedwait() returned unexpected signal."); + } + } +} + +static int killall(int sig, Set *pids, bool send_sighup) { + _cleanup_closedir_ DIR *dir = NULL; + struct dirent *d; + + dir = opendir("/proc"); + if (!dir) + return -errno; + + FOREACH_DIRENT_ALL(d, dir, break) { + pid_t pid; + int r; + + if (!IN_SET(d->d_type, DT_DIR, DT_UNKNOWN)) + continue; + + if (parse_pid(d->d_name, &pid) < 0) + continue; + + if (ignore_proc(pid, sig == SIGKILL && !in_initrd())) + continue; + + if (sig == SIGKILL) { + _cleanup_free_ char *s = NULL; + + get_process_comm(pid, &s); + log_notice("Sending SIGKILL to PID "PID_FMT" (%s).", pid, strna(s)); + } + + if (kill(pid, sig) >= 0) { + if (pids) { + r = set_put(pids, PID_TO_PTR(pid)); + if (r < 0) + log_oom(); + } + } else if (errno != ENOENT) + log_warning_errno(errno, "Could not kill %d: %m", pid); + + if (send_sighup) { + /* Optionally, also send a SIGHUP signal, but + only if the process has a controlling + tty. This is useful to allow handling of + shells which ignore SIGTERM but react to + SIGHUP. We do not send this to processes that + have no controlling TTY since we don't want to + trigger reloads of daemon processes. Also we + make sure to only send this after SIGTERM so + that SIGTERM is always first in the queue. */ + + if (get_ctty_devnr(pid, NULL) >= 0) + /* it's OK if the process is gone, just ignore the result */ + (void) kill(pid, SIGHUP); + } + } + + return set_size(pids); +} + +void broadcast_signal(int sig, bool wait_for_exit, bool send_sighup, usec_t timeout) { + sigset_t mask, oldmask; + _cleanup_set_free_ Set *pids = NULL; + + if (wait_for_exit) + pids = set_new(NULL); + + assert_se(sigemptyset(&mask) == 0); + assert_se(sigaddset(&mask, SIGCHLD) == 0); + assert_se(sigprocmask(SIG_BLOCK, &mask, &oldmask) == 0); + + if (kill(-1, SIGSTOP) < 0 && errno != ESRCH) + log_warning_errno(errno, "kill(-1, SIGSTOP) failed: %m"); + + killall(sig, pids, send_sighup); + + if (kill(-1, SIGCONT) < 0 && errno != ESRCH) + log_warning_errno(errno, "kill(-1, SIGCONT) failed: %m"); + + if (wait_for_exit) + wait_for_children(pids, &mask, timeout); + + assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) == 0); +} diff --git a/src/core/killall.h b/src/core/killall.h new file mode 100644 index 0000000..cbd2026 --- /dev/null +++ b/src/core/killall.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "time-util.h" + +void broadcast_signal(int sig, bool wait_for_exit, bool send_sighup, usec_t timeout); diff --git a/src/core/kmod-setup.c b/src/core/kmod-setup.c new file mode 100644 index 0000000..a91cfeb --- /dev/null +++ b/src/core/kmod-setup.c @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <ftw.h> +#include <string.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bus-util.h" +#include "capability-util.h" +#include "fileio.h" +#include "kmod-setup.h" +#include "macro.h" +#include "string-util.h" + +#if HAVE_KMOD +#include <libkmod.h> +#include "module-util.h" + +static void systemd_kmod_log( + void *data, + int priority, + const char *file, int line, + const char *fn, + const char *format, + va_list args) { + + /* library logging is enabled at debug only */ + DISABLE_WARNING_FORMAT_NONLITERAL; + log_internalv(LOG_DEBUG, 0, file, line, fn, format, args); + REENABLE_WARNING; +} + +static int has_virtio_rng_nftw_cb( + const char *fpath, + const struct stat *sb, + int tflag, + struct FTW *ftwbuf) { + + _cleanup_free_ char *alias = NULL; + int r; + + if ((FTW_D == tflag) && (ftwbuf->level > 2)) + return FTW_SKIP_SUBTREE; + + if (FTW_F != tflag) + return FTW_CONTINUE; + + if (!endswith(fpath, "/modalias")) + return FTW_CONTINUE; + + r = read_one_line_file(fpath, &alias); + if (r < 0) + return FTW_SKIP_SIBLINGS; + + if (startswith(alias, "pci:v00001AF4d00001005")) + return FTW_STOP; + + if (startswith(alias, "pci:v00001AF4d00001044")) + return FTW_STOP; + + return FTW_SKIP_SIBLINGS; +} + +static bool has_virtio_rng(void) { + return (nftw("/sys/devices/pci0000:00", has_virtio_rng_nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL) == FTW_STOP); +} +#endif + +int kmod_setup(void) { +#if HAVE_KMOD + + static const struct { + const char *module; + const char *path; + bool warn_if_unavailable:1; + bool warn_if_module:1; + bool (*condition_fn)(void); + } kmod_table[] = { + /* This one we need to load explicitly, since auto-loading on use doesn't work + * before udev created the ghost device nodes, and we need it earlier than that. */ + { "autofs4", "/sys/class/misc/autofs", true, false, NULL }, + + /* This one we need to load explicitly, since auto-loading of IPv6 is not done when + * we try to configure ::1 on the loopback device. */ + { "ipv6", "/sys/module/ipv6", false, true, NULL }, + + /* This should never be a module */ + { "unix", "/proc/net/unix", true, true, NULL }, + +#if HAVE_LIBIPTC + /* netfilter is needed by networkd, nspawn among others, and cannot be autoloaded */ + { "ip_tables", "/proc/net/ip_tables_names", false, false, NULL }, +#endif + /* virtio_rng would be loaded by udev later, but real entropy might be needed very early */ + { "virtio_rng", NULL, false, false, has_virtio_rng }, + }; + _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL; + unsigned i; + + if (have_effective_cap(CAP_SYS_MODULE) == 0) + return 0; + + for (i = 0; i < ELEMENTSOF(kmod_table); i++) { + if (kmod_table[i].path && access(kmod_table[i].path, F_OK) >= 0) + continue; + + if (kmod_table[i].condition_fn && !kmod_table[i].condition_fn()) + continue; + + if (kmod_table[i].warn_if_module) + log_debug("Your kernel apparently lacks built-in %s support. Might be " + "a good idea to compile it in. We'll now try to work around " + "this by loading the module...", kmod_table[i].module); + + if (!ctx) { + ctx = kmod_new(NULL, NULL); + if (!ctx) + return log_oom(); + + kmod_set_log_fn(ctx, systemd_kmod_log, NULL); + kmod_load_resources(ctx); + } + + (void) module_load_and_warn(ctx, kmod_table[i].module, kmod_table[i].warn_if_unavailable); + } + +#endif + return 0; +} diff --git a/src/core/kmod-setup.h b/src/core/kmod-setup.h new file mode 100644 index 0000000..801c7bf --- /dev/null +++ b/src/core/kmod-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +int kmod_setup(void); diff --git a/src/core/load-dropin.c b/src/core/load-dropin.c new file mode 100644 index 0000000..a50b200 --- /dev/null +++ b/src/core/load-dropin.c @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "conf-parser.h" +#include "fs-util.h" +#include "load-dropin.h" +#include "load-fragment.h" +#include "log.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +static int unit_name_compatible(const char *a, const char *b) { + _cleanup_free_ char *template = NULL; + int r; + + /* The straightforward case: the symlink name matches the target */ + if (streq(a, b)) + return 1; + + r = unit_name_template(a, &template); + if (r == -EINVAL) + return 0; /* Not a template */ + if (r < 0) + return r; /* OOM, or some other failure. Just skip the warning. */ + + /* An instance name points to a target that is just the template name */ + return streq(template, b); +} + +static int process_deps(Unit *u, UnitDependency dependency, const char *dir_suffix) { + _cleanup_strv_free_ char **paths = NULL; + char **p; + int r; + + r = unit_file_find_dropin_paths(NULL, + u->manager->lookup_paths.search_path, + u->manager->unit_path_cache, + dir_suffix, + NULL, + u->names, + &paths); + if (r < 0) + return r; + + STRV_FOREACH(p, paths) { + _cleanup_free_ char *target = NULL; + const char *entry; + + entry = basename(*p); + + if (null_or_empty_path(*p) > 0) { + /* an error usually means an invalid symlink, which is not a mask */ + log_unit_debug(u, "%s dependency on %s is masked by %s, ignoring.", + unit_dependency_to_string(dependency), entry, *p); + continue; + } + + r = is_symlink(*p); + if (r < 0) { + log_unit_warning_errno(u, r, "%s dropin %s unreadable, ignoring: %m", + unit_dependency_to_string(dependency), *p); + continue; + } + if (r == 0) { + log_unit_warning(u, "%s dependency dropin %s is not a symlink, ignoring.", + unit_dependency_to_string(dependency), *p); + continue; + } + + if (!unit_name_is_valid(entry, UNIT_NAME_ANY)) { + log_unit_warning(u, "%s dependency dropin %s is not a valid unit name, ignoring.", + unit_dependency_to_string(dependency), *p); + continue; + } + + r = readlink_malloc(*p, &target); + if (r < 0) { + log_unit_warning_errno(u, r, "readlink(\"%s\") failed, ignoring: %m", *p); + continue; + } + + /* We don't treat this as an error, especially because we didn't check this for a + * long time. Nevertheless, we warn, because such mismatch can be mighty confusing. */ + r = unit_name_compatible(entry, basename(target)); + if (r < 0) { + log_unit_warning_errno(u, r, "Can't check if names %s and %s are compatible, ignoring: %m", entry, basename(target)); + continue; + } + if (r == 0) + log_unit_warning(u, "%s dependency dropin %s target %s has different name", + unit_dependency_to_string(dependency), *p, target); + + r = unit_add_dependency_by_name(u, dependency, entry, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_unit_warning_errno(u, r, "Cannot add %s dependency on %s, ignoring: %m", + unit_dependency_to_string(dependency), entry); + } + + return 0; +} + +int unit_load_dropin(Unit *u) { + _cleanup_strv_free_ char **l = NULL; + char **f; + int r; + + assert(u); + + /* Load dependencies from .wants and .requires directories */ + r = process_deps(u, UNIT_WANTS, ".wants"); + if (r < 0) + return r; + + r = process_deps(u, UNIT_REQUIRES, ".requires"); + if (r < 0) + return r; + + /* Load .conf dropins */ + r = unit_find_dropin_paths(u, &l); + if (r <= 0) + return 0; + + if (!u->dropin_paths) + u->dropin_paths = TAKE_PTR(l); + else { + r = strv_extend_strv(&u->dropin_paths, l, true); + if (r < 0) + return log_oom(); + } + + STRV_FOREACH(f, u->dropin_paths) + (void) config_parse(u->id, *f, NULL, + UNIT_VTABLE(u)->sections, + config_item_perf_lookup, load_fragment_gperf_lookup, + 0, u); + + u->dropin_mtime = now(CLOCK_REALTIME); + + return 0; +} diff --git a/src/core/load-dropin.h b/src/core/load-dropin.h new file mode 100644 index 0000000..bb10a76 --- /dev/null +++ b/src/core/load-dropin.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "dropin.h" +#include "unit.h" + +/* Read service data supplementary drop-in directories */ + +static inline int unit_find_dropin_paths(Unit *u, char ***paths) { + assert(u); + + return unit_file_find_dropin_conf_paths(NULL, + u->manager->lookup_paths.search_path, + u->manager->unit_path_cache, + u->names, + paths); +} + +int unit_load_dropin(Unit *u); diff --git a/src/core/load-fragment-gperf-nulstr.awk b/src/core/load-fragment-gperf-nulstr.awk new file mode 100644 index 0000000..44bc1fb --- /dev/null +++ b/src/core/load-fragment-gperf-nulstr.awk @@ -0,0 +1,14 @@ +BEGIN{ + keywords=0 ; FS="," ; + print "extern const char load_fragment_gperf_nulstr[];" ; + print "const char load_fragment_gperf_nulstr[] =" +} +keyword==1 { + print "\"" $1 "\\0\"" +} +/%%/ { + keyword=1 +} +END { + print ";" +} diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 new file mode 100644 index 0000000..cdbc67f --- /dev/null +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -0,0 +1,459 @@ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include <stddef.h> +#include "conf-parser.h" +#include "load-fragment.h" +#include "missing.h" + +#include "all-units.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name load_fragment_gperf_hash +%define lookup-function-name load_fragment_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +m4_dnl Define the context options only once +m4_define(`EXEC_CONTEXT_CONFIG_ITEMS', +`$1.WorkingDirectory, config_parse_working_directory, 0, offsetof($1, exec_context) +$1.RootDirectory, config_parse_unit_path_printf, true, offsetof($1, exec_context.root_directory) +$1.RootImage, config_parse_unit_path_printf, true, offsetof($1, exec_context.root_image) +$1.User, config_parse_user_group, 0, offsetof($1, exec_context.user) +$1.Group, config_parse_user_group, 0, offsetof($1, exec_context.group) +$1.SupplementaryGroups, config_parse_user_group_strv, 0, offsetof($1, exec_context.supplementary_groups) +$1.Nice, config_parse_exec_nice, 0, offsetof($1, exec_context) +$1.OOMScoreAdjust, config_parse_exec_oom_score_adjust, 0, offsetof($1, exec_context) +$1.IOSchedulingClass, config_parse_exec_io_class, 0, offsetof($1, exec_context) +$1.IOSchedulingPriority, config_parse_exec_io_priority, 0, offsetof($1, exec_context) +$1.CPUSchedulingPolicy, config_parse_exec_cpu_sched_policy, 0, offsetof($1, exec_context) +$1.CPUSchedulingPriority, config_parse_exec_cpu_sched_prio, 0, offsetof($1, exec_context) +$1.CPUSchedulingResetOnFork, config_parse_bool, 0, offsetof($1, exec_context.cpu_sched_reset_on_fork) +$1.CPUAffinity, config_parse_exec_cpu_affinity, 0, offsetof($1, exec_context) +$1.UMask, config_parse_mode, 0, offsetof($1, exec_context.umask) +$1.Environment, config_parse_environ, 0, offsetof($1, exec_context.environment) +$1.EnvironmentFile, config_parse_unit_env_file, 0, offsetof($1, exec_context.environment_files) +$1.PassEnvironment, config_parse_pass_environ, 0, offsetof($1, exec_context.pass_environment) +$1.UnsetEnvironment, config_parse_unset_environ, 0, offsetof($1, exec_context.unset_environment) +$1.DynamicUser, config_parse_bool, true, offsetof($1, exec_context.dynamic_user) +$1.RemoveIPC, config_parse_bool, 0, offsetof($1, exec_context.remove_ipc) +$1.StandardInput, config_parse_exec_input, 0, offsetof($1, exec_context) +$1.StandardOutput, config_parse_exec_output, 0, offsetof($1, exec_context) +$1.StandardError, config_parse_exec_output, 0, offsetof($1, exec_context) +$1.StandardInputText, config_parse_exec_input_text, 0, offsetof($1, exec_context) +$1.StandardInputData, config_parse_exec_input_data, 0, offsetof($1, exec_context) +$1.TTYPath, config_parse_unit_path_printf, 0, offsetof($1, exec_context.tty_path) +$1.TTYReset, config_parse_bool, 0, offsetof($1, exec_context.tty_reset) +$1.TTYVHangup, config_parse_bool, 0, offsetof($1, exec_context.tty_vhangup) +$1.TTYVTDisallocate, config_parse_bool, 0, offsetof($1, exec_context.tty_vt_disallocate) +$1.SyslogIdentifier, config_parse_unit_string_printf, 0, offsetof($1, exec_context.syslog_identifier) +$1.SyslogFacility, config_parse_log_facility, 0, offsetof($1, exec_context.syslog_priority) +$1.SyslogLevel, config_parse_log_level, 0, offsetof($1, exec_context.syslog_priority) +$1.SyslogLevelPrefix, config_parse_bool, 0, offsetof($1, exec_context.syslog_level_prefix) +$1.LogLevelMax, config_parse_log_level, 0, offsetof($1, exec_context.log_level_max) +$1.LogRateLimitIntervalSec, config_parse_sec, 0, offsetof($1, exec_context.log_rate_limit_interval_usec) +$1.LogRateLimitBurst, config_parse_unsigned, 0, offsetof($1, exec_context.log_rate_limit_burst) +$1.LogExtraFields, config_parse_log_extra_fields, 0, offsetof($1, exec_context) +$1.Capabilities, config_parse_warn_compat, DISABLED_LEGACY, offsetof($1, exec_context) +$1.SecureBits, config_parse_exec_secure_bits, 0, offsetof($1, exec_context.secure_bits) +$1.CapabilityBoundingSet, config_parse_capability_set, 0, offsetof($1, exec_context.capability_bounding_set) +$1.AmbientCapabilities, config_parse_capability_set, 0, offsetof($1, exec_context.capability_ambient_set) +$1.TimerSlackNSec, config_parse_nsec, 0, offsetof($1, exec_context.timer_slack_nsec) +$1.NoNewPrivileges, config_parse_bool, 0, offsetof($1, exec_context.no_new_privileges) +$1.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof($1, exec_context.keyring_mode) +m4_ifdef(`HAVE_SECCOMP', +`$1.SystemCallFilter, config_parse_syscall_filter, 0, offsetof($1, exec_context) +$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs) +$1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context) +$1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute) +$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context) +$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime) +$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context) +$1.LockPersonality, config_parse_bool, 0, offsetof($1, exec_context.lock_personality)', +`$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.RestrictNamespaces, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.LockPersonality, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +$1.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof($1, exec_context.rlimit) +$1.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit) +$1.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof($1, exec_context.rlimit) +$1.LimitSTACK, config_parse_rlimit, RLIMIT_STACK, offsetof($1, exec_context.rlimit) +$1.LimitCORE, config_parse_rlimit, RLIMIT_CORE, offsetof($1, exec_context.rlimit) +$1.LimitRSS, config_parse_rlimit, RLIMIT_RSS, offsetof($1, exec_context.rlimit) +$1.LimitNOFILE, config_parse_rlimit, RLIMIT_NOFILE, offsetof($1, exec_context.rlimit) +$1.LimitAS, config_parse_rlimit, RLIMIT_AS, offsetof($1, exec_context.rlimit) +$1.LimitNPROC, config_parse_rlimit, RLIMIT_NPROC, offsetof($1, exec_context.rlimit) +$1.LimitMEMLOCK, config_parse_rlimit, RLIMIT_MEMLOCK, offsetof($1, exec_context.rlimit) +$1.LimitLOCKS, config_parse_rlimit, RLIMIT_LOCKS, offsetof($1, exec_context.rlimit) +$1.LimitSIGPENDING, config_parse_rlimit, RLIMIT_SIGPENDING, offsetof($1, exec_context.rlimit) +$1.LimitMSGQUEUE, config_parse_rlimit, RLIMIT_MSGQUEUE, offsetof($1, exec_context.rlimit) +$1.LimitNICE, config_parse_rlimit, RLIMIT_NICE, offsetof($1, exec_context.rlimit) +$1.LimitRTPRIO, config_parse_rlimit, RLIMIT_RTPRIO, offsetof($1, exec_context.rlimit) +$1.LimitRTTIME, config_parse_rlimit, RLIMIT_RTTIME, offsetof($1, exec_context.rlimit) +$1.ReadWriteDirectories, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_write_paths) +$1.ReadOnlyDirectories, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_only_paths) +$1.InaccessibleDirectories, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths) +$1.ReadWritePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_write_paths) +$1.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_only_paths) +$1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths) +$1.BindPaths, config_parse_bind_paths, 0, offsetof($1, exec_context) +$1.BindReadOnlyPaths, config_parse_bind_paths, 0, offsetof($1, exec_context) +$1.TemporaryFileSystem, config_parse_temporary_filesystems, 0, offsetof($1, exec_context) +$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) +$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) +$1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) +$1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules) +$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) +$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) +$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) +$1.PrivateMounts, config_parse_bool, 0, offsetof($1, exec_context.private_mounts) +$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context.protect_system) +$1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context.protect_home) +$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context.mount_flags) +$1.MountAPIVFS, config_parse_bool, 0, offsetof($1, exec_context.mount_apivfs) +$1.Personality, config_parse_personality, 0, offsetof($1, exec_context.personality) +$1.RuntimeDirectoryPreserve, config_parse_runtime_preserve_mode, 0, offsetof($1, exec_context.runtime_directory_preserve_mode) +$1.RuntimeDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_RUNTIME].mode) +$1.RuntimeDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_RUNTIME].paths) +$1.StateDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_STATE].mode) +$1.StateDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_STATE].paths) +$1.CacheDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_CACHE].mode) +$1.CacheDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_CACHE].paths) +$1.LogsDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_LOGS].mode) +$1.LogsDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_LOGS].paths) +$1.ConfigurationDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION].mode) +$1.ConfigurationDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION].paths) +m4_ifdef(`HAVE_PAM', +`$1.PAMName, config_parse_unit_string_printf, 0, offsetof($1, exec_context.pam_name)', +`$1.PAMName, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +$1.IgnoreSIGPIPE, config_parse_bool, 0, offsetof($1, exec_context.ignore_sigpipe) +$1.UtmpIdentifier, config_parse_unit_string_printf, 0, offsetof($1, exec_context.utmp_id) +$1.UtmpMode, config_parse_exec_utmp_mode, 0, offsetof($1, exec_context.utmp_mode) +m4_ifdef(`HAVE_SELINUX', +`$1.SELinuxContext, config_parse_exec_selinux_context, 0, offsetof($1, exec_context)', +`$1.SELinuxContext, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +m4_ifdef(`HAVE_APPARMOR', +`$1.AppArmorProfile, config_parse_exec_apparmor_profile, 0, offsetof($1, exec_context)', +`$1.AppArmorProfile, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +m4_ifdef(`ENABLE_SMACK', +`$1.SmackProcessLabel, config_parse_exec_smack_process_label, 0, offsetof($1, exec_context)', +`$1.SmackProcessLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0')' +)m4_dnl +m4_define(`KILL_CONTEXT_CONFIG_ITEMS', +`$1.SendSIGKILL, config_parse_bool, 0, offsetof($1, kill_context.send_sigkill) +$1.SendSIGHUP, config_parse_bool, 0, offsetof($1, kill_context.send_sighup) +$1.KillMode, config_parse_kill_mode, 0, offsetof($1, kill_context.kill_mode) +$1.KillSignal, config_parse_signal, 0, offsetof($1, kill_context.kill_signal) +$1.FinalKillSignal, config_parse_signal, 0, offsetof($1, kill_context.final_kill_signal) +$1.WatchdogSignal, config_parse_signal, 0, offsetof($1, kill_context.watchdog_signal)' +)m4_dnl +m4_define(`CGROUP_CONTEXT_CONFIG_ITEMS', +`$1.Slice, config_parse_unit_slice, 0, 0 +$1.CPUAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.cpu_accounting) +$1.CPUWeight, config_parse_cg_weight, 0, offsetof($1, cgroup_context.cpu_weight) +$1.StartupCPUWeight, config_parse_cg_weight, 0, offsetof($1, cgroup_context.startup_cpu_weight) +$1.CPUShares, config_parse_cpu_shares, 0, offsetof($1, cgroup_context.cpu_shares) +$1.StartupCPUShares, config_parse_cpu_shares, 0, offsetof($1, cgroup_context.startup_cpu_shares) +$1.CPUQuota, config_parse_cpu_quota, 0, offsetof($1, cgroup_context) +$1.MemoryAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.memory_accounting) +$1.MemoryMin, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryLow, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryHigh, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryMax, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemorySwapMax, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryLimit, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.DeviceAllow, config_parse_device_allow, 0, offsetof($1, cgroup_context) +$1.DevicePolicy, config_parse_device_policy, 0, offsetof($1, cgroup_context.device_policy) +$1.IOAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.io_accounting) +$1.IOWeight, config_parse_cg_weight, 0, offsetof($1, cgroup_context.io_weight) +$1.StartupIOWeight, config_parse_cg_weight, 0, offsetof($1, cgroup_context.startup_io_weight) +$1.IODeviceWeight, config_parse_io_device_weight, 0, offsetof($1, cgroup_context) +$1.IOReadBandwidthMax, config_parse_io_limit, 0, offsetof($1, cgroup_context) +$1.IOWriteBandwidthMax, config_parse_io_limit, 0, offsetof($1, cgroup_context) +$1.IOReadIOPSMax, config_parse_io_limit, 0, offsetof($1, cgroup_context) +$1.IOWriteIOPSMax, config_parse_io_limit, 0, offsetof($1, cgroup_context) +$1.IODeviceLatencyTargetSec, config_parse_io_device_latency, 0, offsetof($1, cgroup_context) +$1.BlockIOAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.blockio_accounting) +$1.BlockIOWeight, config_parse_blockio_weight, 0, offsetof($1, cgroup_context.blockio_weight) +$1.StartupBlockIOWeight, config_parse_blockio_weight, 0, offsetof($1, cgroup_context.startup_blockio_weight) +$1.BlockIODeviceWeight, config_parse_blockio_device_weight, 0, offsetof($1, cgroup_context) +$1.BlockIOReadBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context) +$1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context) +$1.TasksAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.tasks_accounting) +$1.TasksMax, config_parse_tasks_max, 0, offsetof($1, cgroup_context.tasks_max) +$1.Delegate, config_parse_delegate, 0, offsetof($1, cgroup_context) +$1.DisableControllers, config_parse_disable_controllers, 0, offsetof($1, cgroup_context) +$1.IPAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.ip_accounting) +$1.IPAddressAllow, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_allow) +$1.IPAddressDeny, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_deny) +$1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0' +)m4_dnl +Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) +Unit.Documentation, config_parse_documentation, 0, offsetof(Unit, documentation) +Unit.SourcePath, config_parse_unit_path_printf, 0, offsetof(Unit, source_path) +Unit.Requires, config_parse_unit_deps, UNIT_REQUIRES, 0 +Unit.Requisite, config_parse_unit_deps, UNIT_REQUISITE, 0 +Unit.Wants, config_parse_unit_deps, UNIT_WANTS, 0 +Unit.BindsTo, config_parse_unit_deps, UNIT_BINDS_TO, 0 +Unit.BindTo, config_parse_unit_deps, UNIT_BINDS_TO, 0 +Unit.Conflicts, config_parse_unit_deps, UNIT_CONFLICTS, 0 +Unit.Before, config_parse_unit_deps, UNIT_BEFORE, 0 +Unit.After, config_parse_unit_deps, UNIT_AFTER, 0 +Unit.OnFailure, config_parse_unit_deps, UNIT_ON_FAILURE, 0 +Unit.PropagatesReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0 +Unit.PropagateReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0 +Unit.ReloadPropagatedFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0 +Unit.PropagateReloadFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0 +Unit.PartOf, config_parse_unit_deps, UNIT_PART_OF, 0 +Unit.JoinsNamespaceOf, config_parse_unit_deps, UNIT_JOINS_NAMESPACE_OF, 0 +Unit.RequiresOverridable, config_parse_obsolete_unit_deps, UNIT_REQUIRES, 0 +Unit.RequisiteOverridable, config_parse_obsolete_unit_deps, UNIT_REQUISITE, 0 +Unit.RequiresMountsFor, config_parse_unit_requires_mounts_for, 0, 0 +Unit.StopWhenUnneeded, config_parse_bool, 0, offsetof(Unit, stop_when_unneeded) +Unit.RefuseManualStart, config_parse_bool, 0, offsetof(Unit, refuse_manual_start) +Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop) +Unit.AllowIsolate, config_parse_bool, 0, offsetof(Unit, allow_isolate) +Unit.DefaultDependencies, config_parse_bool, 0, offsetof(Unit, default_dependencies) +Unit.OnFailureJobMode, config_parse_job_mode, 0, offsetof(Unit, on_failure_job_mode) +m4_dnl The following is a legacy alias name for compatibility +Unit.OnFailureIsolate, config_parse_job_mode_isolate, 0, offsetof(Unit, on_failure_job_mode) +Unit.IgnoreOnIsolate, config_parse_bool, 0, offsetof(Unit, ignore_on_isolate) +Unit.IgnoreOnSnapshot, config_parse_warn_compat, DISABLED_LEGACY, 0 +Unit.JobTimeoutSec, config_parse_job_timeout_sec, 0, 0 +Unit.JobRunningTimeoutSec, config_parse_job_running_timeout_sec, 0, 0 +Unit.JobTimeoutAction, config_parse_emergency_action, 0, offsetof(Unit, job_timeout_action) +Unit.JobTimeoutRebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, job_timeout_reboot_arg) +Unit.StartLimitIntervalSec, config_parse_sec, 0, offsetof(Unit, start_limit.interval) +m4_dnl The following is a legacy alias name for compatibility +Unit.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_limit.interval) +Unit.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_limit.burst) +Unit.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action) +Unit.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action) +Unit.SuccessAction, config_parse_emergency_action, 0, offsetof(Unit, success_action) +Unit.FailureActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, failure_action_exit_status) +Unit.SuccessActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, success_action_exit_status) +Unit.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg) +Unit.ConditionPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, conditions) +Unit.ConditionPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, conditions) +Unit.ConditionPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, conditions) +Unit.ConditionPathIsSymbolicLink,config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK,offsetof(Unit, conditions) +Unit.ConditionPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, conditions) +Unit.ConditionPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, conditions) +Unit.ConditionDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, conditions) +Unit.ConditionFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, conditions) +Unit.ConditionFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, conditions) +Unit.ConditionNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, conditions) +Unit.ConditionFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, conditions) +Unit.ConditionKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, conditions) +Unit.ConditionKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, conditions) +Unit.ConditionArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, conditions) +Unit.ConditionVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, conditions) +Unit.ConditionSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, conditions) +Unit.ConditionCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, conditions) +Unit.ConditionHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, conditions) +Unit.ConditionACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, conditions) +Unit.ConditionUser, config_parse_unit_condition_string, CONDITION_USER, offsetof(Unit, conditions) +Unit.ConditionGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, conditions) +Unit.ConditionControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, conditions) +Unit.ConditionNull, config_parse_unit_condition_null, 0, offsetof(Unit, conditions) +Unit.AssertPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, asserts) +Unit.AssertPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, asserts) +Unit.AssertPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, asserts) +Unit.AssertPathIsSymbolicLink, config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK,offsetof(Unit, asserts) +Unit.AssertPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, asserts) +Unit.AssertPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, asserts) +Unit.AssertDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, asserts) +Unit.AssertFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, asserts) +Unit.AssertFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, asserts) +Unit.AssertNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, asserts) +Unit.AssertFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, asserts) +Unit.AssertKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, asserts) +Unit.AssertKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, asserts) +Unit.AssertArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, asserts) +Unit.AssertVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, asserts) +Unit.AssertSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, asserts) +Unit.AssertCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, asserts) +Unit.AssertHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, asserts) +Unit.AssertACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, asserts) +Unit.AssertUser, config_parse_unit_condition_string, CONDITION_USER, offsetof(Unit, asserts) +Unit.AssertGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, asserts) +Unit.AssertControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, asserts) +Unit.AssertNull, config_parse_unit_condition_null, 0, offsetof(Unit, asserts) +Unit.CollectMode, config_parse_collect_mode, 0, offsetof(Unit, collect_mode) +m4_dnl +Service.PIDFile, config_parse_pid_file, 0, offsetof(Service, pid_file) +Service.ExecStartPre, config_parse_exec, SERVICE_EXEC_START_PRE, offsetof(Service, exec_command) +Service.ExecStart, config_parse_exec, SERVICE_EXEC_START, offsetof(Service, exec_command) +Service.ExecStartPost, config_parse_exec, SERVICE_EXEC_START_POST, offsetof(Service, exec_command) +Service.ExecReload, config_parse_exec, SERVICE_EXEC_RELOAD, offsetof(Service, exec_command) +Service.ExecStop, config_parse_exec, SERVICE_EXEC_STOP, offsetof(Service, exec_command) +Service.ExecStopPost, config_parse_exec, SERVICE_EXEC_STOP_POST, offsetof(Service, exec_command) +Service.RestartSec, config_parse_sec, 0, offsetof(Service, restart_usec) +Service.TimeoutSec, config_parse_service_timeout, 0, 0 +Service.TimeoutStartSec, config_parse_service_timeout, 0, 0 +Service.TimeoutStopSec, config_parse_sec_fix_0, 0, offsetof(Service, timeout_stop_usec) +Service.RuntimeMaxSec, config_parse_sec, 0, offsetof(Service, runtime_max_usec) +Service.WatchdogSec, config_parse_sec, 0, offsetof(Service, watchdog_usec) +m4_dnl The following five only exist for compatibility, they moved into Unit, see above +Service.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_limit.interval) +Service.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_limit.burst) +Service.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action) +Service.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action) +Service.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg) +Service.Type, config_parse_service_type, 0, offsetof(Service, type) +Service.Restart, config_parse_service_restart, 0, offsetof(Service, restart) +Service.PermissionsStartOnly, config_parse_bool, 0, offsetof(Service, permissions_start_only) +Service.RootDirectoryStartOnly, config_parse_bool, 0, offsetof(Service, root_directory_start_only) +Service.RemainAfterExit, config_parse_bool, 0, offsetof(Service, remain_after_exit) +Service.GuessMainPID, config_parse_bool, 0, offsetof(Service, guess_main_pid) +Service.RestartPreventExitStatus, config_parse_set_status, 0, offsetof(Service, restart_prevent_status) +Service.RestartForceExitStatus, config_parse_set_status, 0, offsetof(Service, restart_force_status) +Service.SuccessExitStatus, config_parse_set_status, 0, offsetof(Service, success_status) +Service.SysVStartPriority, config_parse_warn_compat, DISABLED_LEGACY, 0 +Service.NonBlocking, config_parse_bool, 0, offsetof(Service, exec_context.non_blocking) +Service.BusName, config_parse_bus_name, 0, offsetof(Service, bus_name) +Service.FileDescriptorStoreMax, config_parse_unsigned, 0, offsetof(Service, n_fd_store_max) +Service.NotifyAccess, config_parse_notify_access, 0, offsetof(Service, notify_access) +Service.Sockets, config_parse_service_sockets, 0, 0 +Service.BusPolicy, config_parse_warn_compat, DISABLED_LEGACY, 0 +Service.USBFunctionDescriptors, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_descriptors) +Service.USBFunctionStrings, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_strings) +EXEC_CONTEXT_CONFIG_ITEMS(Service)m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Service)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Service)m4_dnl +m4_dnl +Socket.ListenStream, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenDatagram, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenSequentialPacket, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenFIFO, config_parse_socket_listen, SOCKET_FIFO, 0 +Socket.ListenNetlink, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenSpecial, config_parse_socket_listen, SOCKET_SPECIAL, 0 +Socket.ListenMessageQueue, config_parse_socket_listen, SOCKET_MQUEUE, 0 +Socket.ListenUSBFunction, config_parse_socket_listen, SOCKET_USB_FUNCTION, 0 +Socket.SocketProtocol, config_parse_socket_protocol, 0, offsetof(Socket, socket_protocol) +Socket.BindIPv6Only, config_parse_socket_bind, 0, offsetof(Socket, bind_ipv6_only) +Socket.Backlog, config_parse_unsigned, 0, offsetof(Socket, backlog) +Socket.BindToDevice, config_parse_socket_bindtodevice, 0, 0 +Socket.ExecStartPre, config_parse_exec, SOCKET_EXEC_START_PRE, offsetof(Socket, exec_command) +Socket.ExecStartPost, config_parse_exec, SOCKET_EXEC_START_POST, offsetof(Socket, exec_command) +Socket.ExecStopPre, config_parse_exec, SOCKET_EXEC_STOP_PRE, offsetof(Socket, exec_command) +Socket.ExecStopPost, config_parse_exec, SOCKET_EXEC_STOP_POST, offsetof(Socket, exec_command) +Socket.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Socket, timeout_usec) +Socket.SocketUser, config_parse_user_group, 0, offsetof(Socket, user) +Socket.SocketGroup, config_parse_user_group, 0, offsetof(Socket, group) +Socket.SocketMode, config_parse_mode, 0, offsetof(Socket, socket_mode) +Socket.DirectoryMode, config_parse_mode, 0, offsetof(Socket, directory_mode) +Socket.Accept, config_parse_bool, 0, offsetof(Socket, accept) +Socket.Writable, config_parse_bool, 0, offsetof(Socket, writable) +Socket.MaxConnections, config_parse_unsigned, 0, offsetof(Socket, max_connections) +Socket.MaxConnectionsPerSource, config_parse_unsigned, 0, offsetof(Socket, max_connections_per_source) +Socket.KeepAlive, config_parse_bool, 0, offsetof(Socket, keep_alive) +Socket.KeepAliveTimeSec, config_parse_sec, 0, offsetof(Socket, keep_alive_time) +Socket.KeepAliveIntervalSec, config_parse_sec, 0, offsetof(Socket, keep_alive_interval) +Socket.KeepAliveProbes, config_parse_unsigned, 0, offsetof(Socket, keep_alive_cnt) +Socket.DeferAcceptSec, config_parse_sec, 0, offsetof(Socket, defer_accept) +Socket.NoDelay, config_parse_bool, 0, offsetof(Socket, no_delay) +Socket.Priority, config_parse_int, 0, offsetof(Socket, priority) +Socket.ReceiveBuffer, config_parse_iec_size, 0, offsetof(Socket, receive_buffer) +Socket.SendBuffer, config_parse_iec_size, 0, offsetof(Socket, send_buffer) +Socket.IPTOS, config_parse_ip_tos, 0, offsetof(Socket, ip_tos) +Socket.IPTTL, config_parse_int, 0, offsetof(Socket, ip_ttl) +Socket.Mark, config_parse_int, 0, offsetof(Socket, mark) +Socket.PipeSize, config_parse_iec_size, 0, offsetof(Socket, pipe_size) +Socket.FreeBind, config_parse_bool, 0, offsetof(Socket, free_bind) +Socket.Transparent, config_parse_bool, 0, offsetof(Socket, transparent) +Socket.Broadcast, config_parse_bool, 0, offsetof(Socket, broadcast) +Socket.PassCredentials, config_parse_bool, 0, offsetof(Socket, pass_cred) +Socket.PassSecurity, config_parse_bool, 0, offsetof(Socket, pass_sec) +Socket.TCPCongestion, config_parse_string, 0, offsetof(Socket, tcp_congestion) +Socket.ReusePort, config_parse_bool, 0, offsetof(Socket, reuse_port) +Socket.MessageQueueMaxMessages, config_parse_long, 0, offsetof(Socket, mq_maxmsg) +Socket.MessageQueueMessageSize, config_parse_long, 0, offsetof(Socket, mq_msgsize) +Socket.RemoveOnStop, config_parse_bool, 0, offsetof(Socket, remove_on_stop) +Socket.Symlinks, config_parse_unit_path_strv_printf, 0, offsetof(Socket, symlinks) +Socket.FileDescriptorName, config_parse_fdname, 0, 0 +Socket.Service, config_parse_socket_service, 0, 0 +Socket.TriggerLimitIntervalSec, config_parse_sec, 0, offsetof(Socket, trigger_limit.interval) +Socket.TriggerLimitBurst, config_parse_unsigned, 0, offsetof(Socket, trigger_limit.burst) +m4_ifdef(`ENABLE_SMACK', +`Socket.SmackLabel, config_parse_unit_string_printf, 0, offsetof(Socket, smack) +Socket.SmackLabelIPIn, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_in) +Socket.SmackLabelIPOut, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_out)', +`Socket.SmackLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +Socket.SmackLabelIPIn, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +Socket.SmackLabelIPOut, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +m4_ifdef(`HAVE_SELINUX', +`Socket.SELinuxContextFromNet, config_parse_bool, 0, offsetof(Socket, selinux_context_from_net)', +`Socket.SELinuxContextFromNet, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +EXEC_CONTEXT_CONFIG_ITEMS(Socket)m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Socket)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Socket)m4_dnl +m4_dnl +Mount.What, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.what) +Mount.Where, config_parse_unit_path_printf, 0, offsetof(Mount, where) +Mount.Options, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.options) +Mount.Type, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.fstype) +Mount.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Mount, timeout_usec) +Mount.DirectoryMode, config_parse_mode, 0, offsetof(Mount, directory_mode) +Mount.SloppyOptions, config_parse_bool, 0, offsetof(Mount, sloppy_options) +Mount.LazyUnmount, config_parse_bool, 0, offsetof(Mount, lazy_unmount) +Mount.ForceUnmount, config_parse_bool, 0, offsetof(Mount, force_unmount) +EXEC_CONTEXT_CONFIG_ITEMS(Mount)m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Mount)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Mount)m4_dnl +m4_dnl +Automount.Where, config_parse_unit_path_printf, 0, offsetof(Automount, where) +Automount.DirectoryMode, config_parse_mode, 0, offsetof(Automount, directory_mode) +Automount.TimeoutIdleSec, config_parse_sec_fix_0, 0, offsetof(Automount, timeout_idle_usec) +m4_dnl +Swap.What, config_parse_unit_path_printf, 0, offsetof(Swap, parameters_fragment.what) +Swap.Priority, config_parse_int, 0, offsetof(Swap, parameters_fragment.priority) +Swap.Options, config_parse_unit_string_printf, 0, offsetof(Swap, parameters_fragment.options) +Swap.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Swap, timeout_usec) +EXEC_CONTEXT_CONFIG_ITEMS(Swap)m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Swap)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Swap)m4_dnl +m4_dnl +Timer.OnCalendar, config_parse_timer, 0, 0 +Timer.OnActiveSec, config_parse_timer, 0, 0 +Timer.OnBootSec, config_parse_timer, 0, 0 +Timer.OnStartupSec, config_parse_timer, 0, 0 +Timer.OnUnitActiveSec, config_parse_timer, 0, 0 +Timer.OnUnitInactiveSec, config_parse_timer, 0, 0 +Timer.Persistent, config_parse_bool, 0, offsetof(Timer, persistent) +Timer.WakeSystem, config_parse_bool, 0, offsetof(Timer, wake_system) +Timer.RemainAfterElapse, config_parse_bool, 0, offsetof(Timer, remain_after_elapse) +Timer.AccuracySec, config_parse_sec, 0, offsetof(Timer, accuracy_usec) +Timer.RandomizedDelaySec, config_parse_sec, 0, offsetof(Timer, random_usec) +Timer.Unit, config_parse_trigger_unit, 0, 0 +m4_dnl +Path.PathExists, config_parse_path_spec, 0, 0 +Path.PathExistsGlob, config_parse_path_spec, 0, 0 +Path.PathChanged, config_parse_path_spec, 0, 0 +Path.PathModified, config_parse_path_spec, 0, 0 +Path.DirectoryNotEmpty, config_parse_path_spec, 0, 0 +Path.Unit, config_parse_trigger_unit, 0, 0 +Path.MakeDirectory, config_parse_bool, 0, offsetof(Path, make_directory) +Path.DirectoryMode, config_parse_mode, 0, offsetof(Path, directory_mode) +m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Slice)m4_dnl +m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Scope)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Scope)m4_dnl +Scope.TimeoutStopSec, config_parse_sec, 0, offsetof(Scope, timeout_stop_usec) +m4_dnl The [Install] section is ignored here. +Install.Alias, NULL, 0, 0 +Install.WantedBy, NULL, 0, 0 +Install.RequiredBy, NULL, 0, 0 +Install.Also, NULL, 0, 0 +Install.DefaultInstance, NULL, 0, 0 diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c new file mode 100644 index 0000000..44f00a3 --- /dev/null +++ b/src/core/load-fragment.c @@ -0,0 +1,4828 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/*** + Copyright © 2012 Holger Hans Peter Freyther +***/ + +#include <errno.h> +#include <fcntl.h> +#include <linux/fs.h> +#include <linux/oom.h> +#if HAVE_SECCOMP +#include <seccomp.h> +#endif +#include <sched.h> +#include <string.h> +#include <sys/resource.h> +#include <sys/stat.h> + +#include "af-list.h" +#include "alloc-util.h" +#include "all-units.h" +#include "bus-error.h" +#include "bus-internal.h" +#include "bus-util.h" +#include "cap-list.h" +#include "capability-util.h" +#include "cgroup.h" +#include "conf-parser.h" +#include "cpu-set-util.h" +#include "env-util.h" +#include "errno-list.h" +#include "escape.h" +#include "fd-util.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "ioprio.h" +#include "ip-protocol-list.h" +#include "journal-util.h" +#include "load-fragment.h" +#include "log.h" +#include "missing.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif +#include "securebits-util.h" +#include "signal-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit-printf.h" +#include "user-util.h" +#include "web-util.h" + +static int parse_socket_protocol(const char *s) { + int r; + + r = parse_ip_protocol(s); + if (r < 0) + return r; + if (!IN_SET(r, IPPROTO_UDPLITE, IPPROTO_SCTP)) + return -EPROTONOSUPPORT; + + return r; +} + +DEFINE_CONFIG_PARSE(config_parse_socket_protocol, parse_socket_protocol, "Failed to parse socket protocol"); +DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Failed to parse secure bits"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_kill_mode, kill_mode, KillMode, "Failed to parse kill mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_home, protect_home, ProtectHome, "Failed to parse protect home value"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_system, protect_system, ProtectSystem, "Failed to parse protect system value"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_runtime_preserve_mode, exec_preserve_mode, ExecPreserveMode, "Failed to parse runtime directory preserve mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_type, service_type, ServiceType, "Failed to parse service type"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceRestart, "Failed to parse service restart specifier"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value"); +DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value"); +DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight"); +DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight"); +DEFINE_CONFIG_PARSE_PTR(config_parse_cpu_shares, cg_cpu_shares_parse, uint64_t, "Invalid CPU shares"); +DEFINE_CONFIG_PARSE_PTR(config_parse_exec_mount_flags, mount_propagation_flags_from_string, unsigned long, "Failed to parse mount flag"); + +int config_parse_unit_deps( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + UnitDependency d = ltype; + Unit *u = userdata; + const char *p; + + assert(filename); + assert(lvalue); + assert(rvalue); + + p = rvalue; + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + int r; + + r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + break; + } + + r = unit_name_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + continue; + } + + r = unit_add_dependency_by_name(u, d, k, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k); + } + + return 0; +} + +int config_parse_obsolete_unit_deps( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unit dependency type %s= is obsolete, replacing by %s=, please update your unit file", lvalue, unit_dependency_to_string(ltype)); + + return config_parse_unit_deps(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata); +} + +int config_parse_unit_string_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_unit_strv_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = userdata; + _cleanup_free_ char *k = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + return config_parse_strv(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_unit_path_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + Unit *u = userdata; + int r; + bool fatal = ltype; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + /* Let's not bother with anything that is too long */ + if (strlen(rvalue) >= PATH_MAX) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "%s value too long%s.", + lvalue, fatal ? "" : ", ignoring"); + return fatal ? -ENAMETOOLONG : 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, fatal ? "" : ", ignoring"); + return fatal ? -ENOEXEC : 0; + } + + return config_parse_path(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_unit_path_strv_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***x = data; + Unit *u = userdata; + int r; + const char *p; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + *x = strv_free(*x); + return 0; + } + + for (p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = strv_push(x, k); + if (r < 0) + return log_oom(); + k = NULL; + } +} + +int config_parse_socket_listen(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ SocketPort *p = NULL; + SocketPort *tail; + Socket *s; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + s = SOCKET(data); + + if (isempty(rvalue)) { + /* An empty assignment removes all ports */ + socket_free_ports(s); + return 0; + } + + p = new0(SocketPort, 1); + if (!p) + return log_oom(); + + if (ltype != SOCKET_SOCKET) { + _cleanup_free_ char *k = NULL; + + r = unit_full_printf(UNIT(s), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + free_and_replace(p->path, k); + p->type = ltype; + + } else if (streq(lvalue, "ListenNetlink")) { + _cleanup_free_ char *k = NULL; + + r = unit_full_printf(UNIT(s), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = socket_address_parse_netlink(&p->address, k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k); + return 0; + } + + p->type = SOCKET_SOCKET; + + } else { + _cleanup_free_ char *k = NULL; + + r = unit_full_printf(UNIT(s), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = socket_address_parse_and_warn(&p->address, k); + if (r < 0) { + if (r != -EAFNOSUPPORT) + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k); + return 0; + } + + if (streq(lvalue, "ListenStream")) + p->address.type = SOCK_STREAM; + else if (streq(lvalue, "ListenDatagram")) + p->address.type = SOCK_DGRAM; + else { + assert(streq(lvalue, "ListenSequentialPacket")); + p->address.type = SOCK_SEQPACKET; + } + + if (socket_address_family(&p->address) != AF_LOCAL && p->address.type == SOCK_SEQPACKET) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Address family not supported, ignoring: %s", rvalue); + return 0; + } + + p->type = SOCKET_SOCKET; + } + + p->fd = -1; + p->auxiliary_fds = NULL; + p->n_auxiliary_fds = 0; + p->socket = s; + + LIST_FIND_TAIL(port, s->ports, tail); + LIST_INSERT_AFTER(port, s->ports, tail, p); + + p = NULL; + + return 0; +} + +int config_parse_exec_nice( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int priority, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->nice_set = false; + return 0; + } + + r = parse_nice(rvalue, &priority); + if (r < 0) { + if (r == -ERANGE) + log_syntax(unit, LOG_ERR, filename, line, r, "Nice priority out of range, ignoring: %s", rvalue); + else + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse nice priority '%s', ignoring: %m", rvalue); + return 0; + } + + c->nice = priority; + c->nice_set = true; + + return 0; +} + +int config_parse_exec_oom_score_adjust( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int oa, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->oom_score_adjust_set = false; + return 0; + } + + r = parse_oom_score_adjust(rvalue, &oa); + if (r < 0) { + if (r == -ERANGE) + log_syntax(unit, LOG_ERR, filename, line, r, "OOM score adjust value out of range, ignoring: %s", rvalue); + else + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue); + return 0; + } + + c->oom_score_adjust = oa; + c->oom_score_adjust_set = true; + + return 0; +} + +int config_parse_exec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecCommand **e = data; + Unit *u = userdata; + const char *p; + bool semicolon; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(e); + + e += ltype; + rvalue += strspn(rvalue, WHITESPACE); + + if (isempty(rvalue)) { + /* An empty assignment resets the list */ + *e = exec_command_free_list(*e); + return 0; + } + + p = rvalue; + do { + _cleanup_free_ char *path = NULL, *firstword = NULL; + ExecCommandFlags flags = 0; + bool ignore = false, separate_argv0 = false; + _cleanup_free_ ExecCommand *nce = NULL; + _cleanup_strv_free_ char **n = NULL; + size_t nlen = 0, nbufsize = 0; + const char *f; + + semicolon = false; + + r = extract_first_word_and_warn(&p, &firstword, NULL, EXTRACT_QUOTES|EXTRACT_CUNESCAPE, unit, filename, line, rvalue); + if (r <= 0) + return 0; + + f = firstword; + for (;;) { + /* We accept an absolute path as first argument. If it's prefixed with - and the path doesn't + * exist, we ignore it instead of erroring out; if it's prefixed with @, we allow overriding of + * argv[0]; if it's prefixed with +, it will be run with full privileges and no sandboxing; if + * it's prefixed with '!' we apply sandboxing, but do not change user/group credentials; if + * it's prefixed with '!!', then we apply user/group credentials if the kernel supports ambient + * capabilities -- if it doesn't we don't apply the credentials themselves, but do apply most + * other sandboxing, with some special exceptions for changing UID. + * + * The idea is that '!!' may be used to write services that can take benefit of systemd's + * UID/GID dropping if the kernel supports ambient creds, but provide an automatic fallback to + * privilege dropping within the daemon if the kernel does not offer that. */ + + if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) { + flags |= EXEC_COMMAND_IGNORE_FAILURE; + ignore = true; + } else if (*f == '@' && !separate_argv0) + separate_argv0 = true; + else if (*f == '+' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))) + flags |= EXEC_COMMAND_FULLY_PRIVILEGED; + else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))) + flags |= EXEC_COMMAND_NO_SETUID; + else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))) { + flags &= ~EXEC_COMMAND_NO_SETUID; + flags |= EXEC_COMMAND_AMBIENT_MAGIC; + } else + break; + f++; + } + + r = unit_full_printf(u, f, &path); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + f, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + if (isempty(path)) { + /* First word is either "-" or "@" with no command. */ + log_syntax(unit, LOG_ERR, filename, line, 0, + "Empty path in command line%s: '%s'", + ignore ? ", ignoring" : "", rvalue); + return ignore ? 0 : -ENOEXEC; + } + if (!string_is_safe(path)) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "Executable name contains special characters%s: %s", + ignore ? ", ignoring" : "", path); + return ignore ? 0 : -ENOEXEC; + } + if (endswith(path, "/")) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "Executable path specifies a directory%s: %s", + ignore ? ", ignoring" : "", path); + return ignore ? 0 : -ENOEXEC; + } + + if (!path_is_absolute(path)) { + const char *prefix; + bool found = false; + + if (!filename_is_valid(path)) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "Neither a valid executable name nor an absolute path%s: %s", + ignore ? ", ignoring" : "", path); + return ignore ? 0 : -ENOEXEC; + } + + /* Resolve a single-component name to a full path */ + NULSTR_FOREACH(prefix, DEFAULT_PATH_NULSTR) { + _cleanup_free_ char *fullpath = NULL; + + fullpath = strjoin(prefix, "/", path); + if (!fullpath) + return log_oom(); + + if (access(fullpath, F_OK) >= 0) { + free_and_replace(path, fullpath); + found = true; + break; + } + } + + if (!found) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "Executable \"%s\" not found in path \"%s\"%s", + path, DEFAULT_PATH, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + } + + if (!separate_argv0) { + char *w = NULL; + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + w = strdup(path); + if (!w) + return log_oom(); + n[nlen++] = w; + n[nlen] = NULL; + } + + path_simplify(path, false); + + while (!isempty(p)) { + _cleanup_free_ char *word = NULL, *resolved = NULL; + + /* Check explicitly for an unquoted semicolon as + * command separator token. */ + if (p[0] == ';' && (!p[1] || strchr(WHITESPACE, p[1]))) { + p++; + p += strspn(p, WHITESPACE); + semicolon = true; + break; + } + + /* Check for \; explicitly, to not confuse it with \\; or "\;" or "\\;" etc. + * extract_first_word() would return the same for all of those. */ + if (p[0] == '\\' && p[1] == ';' && (!p[2] || strchr(WHITESPACE, p[2]))) { + char *w; + + p += 2; + p += strspn(p, WHITESPACE); + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + w = strdup(";"); + if (!w) + return log_oom(); + n[nlen++] = w; + n[nlen] = NULL; + continue; + } + + r = extract_first_word_and_warn(&p, &word, NULL, EXTRACT_QUOTES|EXTRACT_CUNESCAPE, unit, filename, line, rvalue); + if (r == 0) + break; + if (r < 0) + return ignore ? 0 : -ENOEXEC; + + r = unit_full_printf(u, word, &resolved); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in %s%s: %m", + word, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + n[nlen++] = TAKE_PTR(resolved); + n[nlen] = NULL; + } + + if (!n || !n[0]) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "Empty executable name or zeroeth argument%s: %s", + ignore ? ", ignoring" : "", rvalue); + return ignore ? 0 : -ENOEXEC; + } + + nce = new0(ExecCommand, 1); + if (!nce) + return log_oom(); + + nce->argv = TAKE_PTR(n); + nce->path = TAKE_PTR(path); + nce->flags = flags; + + exec_command_append_list(e, nce); + + /* Do not _cleanup_free_ these. */ + nce = NULL; + + rvalue = p; + } while (semicolon); + + return 0; +} + +int config_parse_socket_bindtodevice( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Socket *s = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue) || streq(rvalue, "*")) { + s->bind_to_device = mfree(s->bind_to_device); + return 0; + } + + if (!ifname_valid(rvalue)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid interface name, ignoring: %s", rvalue); + return 0; + } + + if (free_and_strdup(&s->bind_to_device, rvalue) < 0) + return log_oom(); + + return 0; +} + +int config_parse_exec_input( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + Unit *u = userdata; + const char *n; + ExecInput ei; + int r; + + assert(data); + assert(filename); + assert(line); + assert(rvalue); + + n = startswith(rvalue, "fd:"); + if (n) { + _cleanup_free_ char *resolved = NULL; + + r = unit_full_printf(u, n, &resolved); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s': %m", n); + + if (isempty(resolved)) + resolved = mfree(resolved); + else if (!fdname_is_valid(resolved)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid file descriptor name: %s", resolved); + return -ENOEXEC; + } + + free_and_replace(c->stdio_fdname[STDIN_FILENO], resolved); + + ei = EXEC_INPUT_NAMED_FD; + + } else if ((n = startswith(rvalue, "file:"))) { + _cleanup_free_ char *resolved = NULL; + + r = unit_full_printf(u, n, &resolved); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s': %m", n); + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return -ENOEXEC; + + free_and_replace(c->stdio_file[STDIN_FILENO], resolved); + + ei = EXEC_INPUT_FILE; + + } else { + ei = exec_input_from_string(rvalue); + if (ei < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse input specifier, ignoring: %s", rvalue); + return 0; + } + } + + c->std_input = ei; + return 0; +} + +int config_parse_exec_input_text( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *unescaped = NULL, *resolved = NULL; + ExecContext *c = data; + Unit *u = userdata; + size_t sz; + void *p; + int r; + + assert(data); + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + return 0; + } + + r = cunescape(rvalue, 0, &unescaped); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to decode C escaped text '%s': %m", rvalue); + + r = unit_full_printf(u, unescaped, &resolved); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s': %m", unescaped); + + sz = strlen(resolved); + if (c->stdin_data_size + sz + 1 < c->stdin_data_size || /* check for overflow */ + c->stdin_data_size + sz + 1 > EXEC_STDIN_DATA_MAX) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Standard input data too large (%zu), maximum of %zu permitted, ignoring.", c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX); + return -E2BIG; + } + + p = realloc(c->stdin_data, c->stdin_data_size + sz + 1); + if (!p) + return log_oom(); + + *((char*) mempcpy((char*) p + c->stdin_data_size, resolved, sz)) = '\n'; + + c->stdin_data = p; + c->stdin_data_size += sz + 1; + + return 0; +} + +int config_parse_exec_input_data( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ void *p = NULL; + ExecContext *c = data; + size_t sz; + void *q; + int r; + + assert(data); + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + return 0; + } + + r = unbase64mem(rvalue, (size_t) -1, &p, &sz); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to decode base64 data, ignoring: %s", rvalue); + + assert(sz > 0); + + if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */ + c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Standard input data too large (%zu), maximum of %zu permitted, ignoring.", c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX); + return -E2BIG; + } + + q = realloc(c->stdin_data, c->stdin_data_size + sz); + if (!q) + return log_oom(); + + memcpy((uint8_t*) q + c->stdin_data_size, p, sz); + + c->stdin_data = q; + c->stdin_data_size += sz; + + return 0; +} + +int config_parse_exec_output( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *resolved = NULL; + const char *n; + ExecContext *c = data; + Unit *u = userdata; + ExecOutput eo; + int r; + + assert(data); + assert(filename); + assert(line); + assert(lvalue); + assert(rvalue); + + n = startswith(rvalue, "fd:"); + if (n) { + r = unit_full_printf(u, n, &resolved); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", n); + + if (isempty(resolved)) + resolved = mfree(resolved); + else if (!fdname_is_valid(resolved)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid file descriptor name: %s", resolved); + return -ENOEXEC; + } + + eo = EXEC_OUTPUT_NAMED_FD; + + } else if ((n = startswith(rvalue, "file:"))) { + + r = unit_full_printf(u, n, &resolved); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", n); + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return -ENOEXEC; + + eo = EXEC_OUTPUT_FILE; + + } else if ((n = startswith(rvalue, "append:"))) { + + r = unit_full_printf(u, n, &resolved); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", n); + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return -ENOEXEC; + + eo = EXEC_OUTPUT_FILE_APPEND; + } else { + eo = exec_output_from_string(rvalue); + if (eo < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse output specifier, ignoring: %s", rvalue); + return 0; + } + } + + if (streq(lvalue, "StandardOutput")) { + if (eo == EXEC_OUTPUT_NAMED_FD) + free_and_replace(c->stdio_fdname[STDOUT_FILENO], resolved); + else + free_and_replace(c->stdio_file[STDOUT_FILENO], resolved); + + c->std_output = eo; + + } else { + assert(streq(lvalue, "StandardError")); + + if (eo == EXEC_OUTPUT_NAMED_FD) + free_and_replace(c->stdio_fdname[STDERR_FILENO], resolved); + else + free_and_replace(c->stdio_file[STDERR_FILENO], resolved); + + c->std_error = eo; + } + + return 0; +} + +int config_parse_exec_io_class(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int x; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->ioprio_set = false; + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0); + return 0; + } + + x = ioprio_class_from_string(rvalue); + if (x < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse IO scheduling class, ignoring: %s", rvalue); + return 0; + } + + c->ioprio = IOPRIO_PRIO_VALUE(x, IOPRIO_PRIO_DATA(c->ioprio)); + c->ioprio_set = true; + + return 0; +} + +int config_parse_exec_io_priority(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int i, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->ioprio_set = false; + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0); + return 0; + } + + r = ioprio_parse_priority(rvalue, &i); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse IO priority, ignoring: %s", rvalue); + return 0; + } + + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_PRIO_CLASS(c->ioprio), i); + c->ioprio_set = true; + + return 0; +} + +int config_parse_exec_cpu_sched_policy(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int x; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->cpu_sched_set = false; + c->cpu_sched_policy = SCHED_OTHER; + c->cpu_sched_priority = 0; + return 0; + } + + x = sched_policy_from_string(rvalue); + if (x < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse CPU scheduling policy, ignoring: %s", rvalue); + return 0; + } + + c->cpu_sched_policy = x; + /* Moving to or from real-time policy? We need to adjust the priority */ + c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(x), sched_get_priority_max(x)); + c->cpu_sched_set = true; + + return 0; +} + +int config_parse_exec_cpu_sched_prio(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int i, min, max, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = safe_atoi(rvalue, &i); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse CPU scheduling priority, ignoring: %s", rvalue); + return 0; + } + + /* On Linux RR/FIFO range from 1 to 99 and OTHER/BATCH may only be 0 */ + min = sched_get_priority_min(c->cpu_sched_policy); + max = sched_get_priority_max(c->cpu_sched_policy); + + if (i < min || i > max) { + log_syntax(unit, LOG_ERR, filename, line, 0, "CPU scheduling priority is out of range, ignoring: %s", rvalue); + return 0; + } + + c->cpu_sched_priority = i; + c->cpu_sched_set = true; + + return 0; +} + +int config_parse_exec_cpu_affinity(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + _cleanup_cpu_free_ cpu_set_t *cpuset = NULL; + int ncpus; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + ncpus = parse_cpu_set_and_warn(rvalue, &cpuset, unit, filename, line, lvalue); + if (ncpus < 0) + return ncpus; + + if (ncpus == 0) { + /* An empty assignment resets the CPU list */ + c->cpuset = cpu_set_mfree(c->cpuset); + c->cpuset_ncpus = 0; + return 0; + } + + if (!c->cpuset) { + c->cpuset = TAKE_PTR(cpuset); + c->cpuset_ncpus = (unsigned) ncpus; + return 0; + } + + if (c->cpuset_ncpus < (unsigned) ncpus) { + CPU_OR_S(CPU_ALLOC_SIZE(c->cpuset_ncpus), cpuset, c->cpuset, cpuset); + CPU_FREE(c->cpuset); + c->cpuset = TAKE_PTR(cpuset); + c->cpuset_ncpus = (unsigned) ncpus; + return 0; + } + + CPU_OR_S(CPU_ALLOC_SIZE((unsigned) ncpus), c->cpuset, c->cpuset, cpuset); + + return 0; +} + +int config_parse_capability_set( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *capability_set = data; + uint64_t sum = 0, initial = 0; + bool invert = false; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (streq(lvalue, "CapabilityBoundingSet")) + initial = CAP_ALL; /* initialized to all bits on */ + /* else "AmbientCapabilities" initialized to all bits off */ + + r = capability_set_from_string(rvalue, &sum); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse %s= specifier '%s', ignoring: %m", lvalue, rvalue); + return 0; + } + + if (sum == 0 || *capability_set == initial) + /* "", "~" or uninitialized data -> replace */ + *capability_set = invert ? ~sum : sum; + else { + /* previous data -> merge */ + if (invert) + *capability_set &= ~sum; + else + *capability_set |= sum; + } + + return 0; +} + +int config_parse_exec_selinux_context( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + Unit *u = userdata; + bool ignore; + char *k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->selinux_context = mfree(c->selinux_context); + c->selinux_context_ignore = false; + return 0; + } + + if (rvalue[0] == '-') { + ignore = true; + rvalue++; + } else + ignore = false; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + free_and_replace(c->selinux_context, k); + c->selinux_context_ignore = ignore; + + return 0; +} + +int config_parse_exec_apparmor_profile( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + Unit *u = userdata; + bool ignore; + char *k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->apparmor_profile = mfree(c->apparmor_profile); + c->apparmor_profile_ignore = false; + return 0; + } + + if (rvalue[0] == '-') { + ignore = true; + rvalue++; + } else + ignore = false; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + free_and_replace(c->apparmor_profile, k); + c->apparmor_profile_ignore = ignore; + + return 0; +} + +int config_parse_exec_smack_process_label( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + Unit *u = userdata; + bool ignore; + char *k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->smack_process_label = mfree(c->smack_process_label); + c->smack_process_label_ignore = false; + return 0; + } + + if (rvalue[0] == '-') { + ignore = true; + rvalue++; + } else + ignore = false; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + free_and_replace(c->smack_process_label, k); + c->smack_process_label_ignore = ignore; + + return 0; +} + +int config_parse_timer(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Timer *t = data; + usec_t usec = 0; + TimerValue *v; + TimerBase b; + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + Unit *u = userdata; + _cleanup_free_ char *k = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets list */ + timer_free_values(t); + return 0; + } + + b = timer_base_from_string(lvalue); + if (b < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse timer base, ignoring: %s", lvalue); + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + if (b == TIMER_CALENDAR) { + if (calendar_spec_from_string(k, &c) < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse calendar specification, ignoring: %s", k); + return 0; + } + } else + if (parse_sec(k, &usec) < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse timer value, ignoring: %s", k); + return 0; + } + + v = new0(TimerValue, 1); + if (!v) + return log_oom(); + + v->base = b; + v->value = usec; + v->calendar_spec = TAKE_PTR(c); + + LIST_PREPEND(value, t->values, v); + + return 0; +} + +int config_parse_trigger_unit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *p = NULL; + Unit *u = data; + UnitType type; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (!hashmap_isempty(u->dependencies[UNIT_TRIGGERS])) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Multiple units to trigger specified, ignoring: %s", rvalue); + return 0; + } + + r = unit_name_printf(u, rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + type = unit_name_to_type(p); + if (type < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Unit type not valid, ignoring: %s", rvalue); + return 0; + } + if (unit_has_name(u, p)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Units cannot trigger themselves, ignoring: %s", rvalue); + return 0; + } + + r = unit_add_two_dependencies_by_name(u, UNIT_BEFORE, UNIT_TRIGGERS, p, true, UNIT_DEPENDENCY_FILE); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to add trigger on %s, ignoring: %m", p); + return 0; + } + + return 0; +} + +int config_parse_path_spec(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Path *p = data; + PathSpec *s; + PathType b; + _cleanup_free_ char *k = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment clears list */ + path_free_specs(p); + return 0; + } + + b = path_type_from_string(lvalue); + if (b < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse path type, ignoring: %s", lvalue); + return 0; + } + + r = unit_full_printf(UNIT(p), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + s = new0(PathSpec, 1); + if (!s) + return log_oom(); + + s->unit = UNIT(p); + s->path = TAKE_PTR(k); + s->type = b; + s->inotify_fd = -1; + + LIST_PREPEND(spec, p->specs, s); + + return 0; +} + +int config_parse_socket_service( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *p = NULL; + Socket *s = data; + Unit *x; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = unit_name_printf(UNIT(s), rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", rvalue); + return -ENOEXEC; + } + + if (!endswith(p, ".service")) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Unit must be of type service: %s", rvalue); + return -ENOEXEC; + } + + r = manager_load_unit(UNIT(s)->manager, p, NULL, &error, &x); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to load unit %s: %s", rvalue, bus_error_message(&error, r)); + return -ENOEXEC; + } + + unit_ref_set(&s->service, UNIT(s), x); + + return 0; +} + +int config_parse_fdname( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *p = NULL; + Socket *s = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + s->fdname = mfree(s->fdname); + return 0; + } + + r = unit_full_printf(UNIT(s), rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + if (!fdname_is_valid(p)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid file descriptor name, ignoring: %s", p); + return 0; + } + + return free_and_replace(s->fdname, p); +} + +int config_parse_service_sockets( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Service *s = data; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + p = rvalue; + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Trailing garbage in sockets, ignoring: %s", rvalue); + break; + } + + r = unit_name_printf(UNIT(s), word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + continue; + } + + if (!endswith(k, ".socket")) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Unit must be of type socket, ignoring: %s", k); + continue; + } + + r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_WANTS, UNIT_AFTER, k, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k); + + r = unit_add_dependency_by_name(UNIT(s), UNIT_TRIGGERED_BY, k, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k); + } + + return 0; +} + +int config_parse_bus_name( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + if (!service_name_is_valid(k)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid bus name, ignoring: %s", k); + return 0; + } + + return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_service_timeout( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Service *s = userdata; + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(s); + + /* This is called for two cases: TimeoutSec= and TimeoutStartSec=. */ + + /* Traditionally, these options accepted 0 to disable the timeouts. However, a timeout of 0 suggests it happens + * immediately, hence fix this to become USEC_INFINITY instead. This is in-line with how we internally handle + * all other timeouts. */ + r = parse_sec_fix_0(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse %s= parameter, ignoring: %s", lvalue, rvalue); + return 0; + } + + s->start_timeout_defined = true; + s->timeout_start_usec = usec; + + if (streq(lvalue, "TimeoutSec")) + s->timeout_stop_usec = usec; + + return 0; +} + +int config_parse_sec_fix_0( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t *usec = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(usec); + + /* This is pretty much like config_parse_sec(), except that this treats a time of 0 as infinity, for + * compatibility with older versions of systemd where 0 instead of infinity was used as indicator to turn off a + * timeout. */ + + r = parse_sec_fix_0(rvalue, usec); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse %s= parameter, ignoring: %s", lvalue, rvalue); + return 0; + } + + return 0; +} + +int config_parse_user_group( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + char **user = data; + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + *user = mfree(*user); + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", rvalue); + return -ENOEXEC; + } + + if (!valid_user_group_name_or_id(k)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k); + return -ENOEXEC; + } + + return free_and_replace(*user, k); +} + +int config_parse_user_group_strv( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***users = data; + Unit *u = userdata; + const char *p = rvalue; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + *users = strv_free(*users); + return 0; + } + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid syntax: %s", rvalue); + return -ENOEXEC; + } + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", word); + return -ENOEXEC; + } + + if (!valid_user_group_name_or_id(k)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k); + return -ENOEXEC; + } + + r = strv_push(users, k); + if (r < 0) + return log_oom(); + + k = NULL; + } + + return 0; +} + +int config_parse_working_directory( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + Unit *u = userdata; + bool missing_ok; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(c); + assert(u); + + if (isempty(rvalue)) { + c->working_directory_home = false; + c->working_directory = mfree(c->working_directory); + return 0; + } + + if (rvalue[0] == '-') { + missing_ok = true; + rvalue++; + } else + missing_ok = false; + + if (streq(rvalue, "~")) { + c->working_directory_home = true; + c->working_directory = mfree(c->working_directory); + } else { + _cleanup_free_ char *k = NULL; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in working directory path '%s'%s: %m", + rvalue, missing_ok ? ", ignoring" : ""); + return missing_ok ? 0 : -ENOEXEC; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE | (missing_ok ? 0 : PATH_CHECK_FATAL), unit, filename, line, lvalue); + if (r < 0) + return missing_ok ? 0 : -ENOEXEC; + + c->working_directory_home = false; + free_and_replace(c->working_directory, k); + } + + c->working_directory_missing_ok = missing_ok; + return 0; +} + +int config_parse_unit_env_file(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***env = data; + Unit *u = userdata; + _cleanup_free_ char *n = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment frees the list */ + *env = strv_free(*env); + return 0; + } + + r = unit_full_printf(u, rvalue, &n); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(n[0] == '-' ? n + 1 : n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = strv_push(env, n); + if (r < 0) + return log_oom(); + + n = NULL; + + return 0; +} + +int config_parse_environ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = userdata; + char ***env = data; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *env = strv_free(*env); + return 0; + } + + for (p = rvalue;; ) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_QUOTES); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + if (u) { + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in %s, ignoring: %m", word); + continue; + } + } else + k = TAKE_PTR(word); + + if (!env_assignment_is_valid(k)) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "Invalid environment assignment, ignoring: %s", k); + continue; + } + + r = strv_env_replace(env, k); + if (r < 0) + return log_oom(); + + k = NULL; + } +} + +int config_parse_pass_environ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_strv_free_ char **n = NULL; + size_t nlen = 0, nbufsize = 0; + char*** passenv = data; + const char *p = rvalue; + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *passenv = strv_free(*passenv); + return 0; + } + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Trailing garbage in %s, ignoring: %s", lvalue, rvalue); + break; + } + + if (u) { + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve specifiers in %s, ignoring: %m", word); + continue; + } + } else + k = TAKE_PTR(word); + + if (!env_name_is_valid(k)) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "Invalid environment name for %s, ignoring: %s", lvalue, k); + continue; + } + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + n[nlen++] = TAKE_PTR(k); + n[nlen] = NULL; + } + + if (n) { + r = strv_extend_strv(passenv, n, true); + if (r < 0) + return r; + } + + return 0; +} + +int config_parse_unset_environ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_strv_free_ char **n = NULL; + size_t nlen = 0, nbufsize = 0; + char*** unsetenv = data; + const char *p = rvalue; + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *unsetenv = strv_free(*unsetenv); + return 0; + } + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_QUOTES); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Trailing garbage in %s, ignoring: %s", lvalue, rvalue); + break; + } + + if (u) { + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in %s, ignoring: %m", word); + continue; + } + } else + k = TAKE_PTR(word); + + if (!env_assignment_is_valid(k) && !env_name_is_valid(k)) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "Invalid environment name or assignment %s, ignoring: %s", lvalue, k); + continue; + } + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + n[nlen++] = TAKE_PTR(k); + n[nlen] = NULL; + } + + if (n) { + r = strv_extend_strv(unsetenv, n, true); + if (r < 0) + return r; + } + + return 0; +} + +int config_parse_log_extra_fields( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + Unit *u = userdata; + const char *p = rvalue; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(c); + + if (isempty(rvalue)) { + exec_context_free_log_extra_fields(c); + return 0; + } + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + struct iovec *t; + const char *eq; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_QUOTES); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", word); + continue; + } + + eq = strchr(k, '='); + if (!eq) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Log field lacks '=' character, ignoring: %s", k); + continue; + } + + if (!journal_field_valid(k, eq-k, false)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Log field name is invalid, ignoring: %s", k); + continue; + } + + t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec)); + if (!t) + return log_oom(); + + c->log_extra_fields = t; + c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE_STRING(k); + + k = NULL; + } +} + +int config_parse_unit_condition_path( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *p = NULL; + Condition **list = data, *c; + ConditionType t = ltype; + bool trigger, negate; + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *list = condition_free_list(*list); + return 0; + } + + trigger = rvalue[0] == '|'; + if (trigger) + rvalue++; + + negate = rvalue[0] == '!'; + if (negate) + rvalue++; + + r = unit_full_printf(u, rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(p, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + c = condition_new(t, p, trigger, negate); + if (!c) + return log_oom(); + + LIST_PREPEND(conditions, *list, c); + return 0; +} + +int config_parse_unit_condition_string( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *s = NULL; + Condition **list = data, *c; + ConditionType t = ltype; + bool trigger, negate; + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *list = condition_free_list(*list); + return 0; + } + + trigger = rvalue[0] == '|'; + if (trigger) + rvalue++; + + negate = rvalue[0] == '!'; + if (negate) + rvalue++; + + r = unit_full_printf(u, rvalue, &s); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + c = condition_new(t, s, trigger, negate); + if (!c) + return log_oom(); + + LIST_PREPEND(conditions, *list, c); + return 0; +} + +int config_parse_unit_condition_null( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Condition **list = data, *c; + bool trigger, negate; + int b; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *list = condition_free_list(*list); + return 0; + } + + trigger = rvalue[0] == '|'; + if (trigger) + rvalue++; + + negate = rvalue[0] == '!'; + if (negate) + rvalue++; + + b = parse_boolean(rvalue); + if (b < 0) { + log_syntax(unit, LOG_ERR, filename, line, b, "Failed to parse boolean value in condition, ignoring: %s", rvalue); + return 0; + } + + if (!b) + negate = !negate; + + c = condition_new(CONDITION_NULL, NULL, trigger, negate); + if (!c) + return log_oom(); + + LIST_PREPEND(conditions, *list, c); + return 0; +} + +int config_parse_unit_requires_mounts_for( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const char *p = rvalue; + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + for (;;) { + _cleanup_free_ char *word = NULL, *resolved = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_full_printf(u, word, &resolved); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + continue; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + r = unit_require_mounts_for(u, resolved, UNIT_DEPENDENCY_FILE); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to add required mount '%s', ignoring: %m", resolved); + continue; + } + } +} + +int config_parse_documentation(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = userdata; + int r; + char **a, **b; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + u->documentation = strv_free(u->documentation); + return 0; + } + + r = config_parse_unit_strv_printf(unit, filename, line, section, section_line, lvalue, ltype, + rvalue, data, userdata); + if (r < 0) + return r; + + for (a = b = u->documentation; a && *a; a++) { + + if (documentation_url_is_valid(*a)) + *(b++) = *a; + else { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid URL, ignoring: %s", *a); + free(*a); + } + } + if (b) + *b = NULL; + + return r; +} + +#if HAVE_SECCOMP +int config_parse_syscall_filter( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + Unit *u = userdata; + bool invert = false; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->syscall_filter = hashmap_free(c->syscall_filter); + c->syscall_whitelist = false; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->syscall_filter) { + c->syscall_filter = hashmap_new(NULL); + if (!c->syscall_filter) + return log_oom(); + + if (invert) + /* Allow everything but the ones listed */ + c->syscall_whitelist = false; + else { + /* Allow nothing but the ones listed */ + c->syscall_whitelist = true; + + /* Accept default syscalls if we are on a whitelist */ + r = seccomp_parse_syscall_filter("@default", -1, c->syscall_filter, SECCOMP_PARSE_WHITELIST); + if (r < 0) + return r; + } + } + + p = rvalue; + for (;;) { + _cleanup_free_ char *word = NULL, *name = NULL; + int num; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = parse_syscall_and_errno(word, &name, &num); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse syscall:errno, ignoring: %s", word); + continue; + } + + r = seccomp_parse_syscall_filter_full(name, num, c->syscall_filter, + SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE|(invert ? SECCOMP_PARSE_INVERT : 0)|(c->syscall_whitelist ? SECCOMP_PARSE_WHITELIST : 0), + unit, filename, line); + if (r < 0) + return r; + } +} + +int config_parse_syscall_archs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const char *p = rvalue; + Set **archs = data; + int r; + + if (isempty(rvalue)) { + *archs = set_free(*archs); + return 0; + } + + r = set_ensure_allocated(archs, NULL); + if (r < 0) + return log_oom(); + + for (;;) { + _cleanup_free_ char *word = NULL; + uint32_t a; + + r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = seccomp_arch_from_string(word, &a); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to parse system call architecture \"%s\", ignoring: %m", word); + continue; + } + + r = set_put(*archs, UINT32_TO_PTR(a + 1)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_syscall_errno( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int e; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets to KILL */ + c->syscall_errno = 0; + return 0; + } + + e = parse_errno(rvalue); + if (e <= 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse error number, ignoring: %s", rvalue); + return 0; + } + + c->syscall_errno = e; + return 0; +} + +int config_parse_address_families( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + bool invert = false; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->address_families = set_free(c->address_families); + c->address_families_whitelist = false; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->address_families) { + c->address_families = set_new(NULL); + if (!c->address_families) + return log_oom(); + + c->address_families_whitelist = !invert; + } + + for (p = rvalue;;) { + _cleanup_free_ char *word = NULL; + int af; + + r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + af = af_from_name(word); + if (af < 0) { + log_syntax(unit, LOG_ERR, filename, line, af, + "Failed to parse address family, ignoring: %s", word); + continue; + } + + /* If we previously wanted to forbid an address family and now + * we want to allow it, then just remove it from the list. + */ + if (!invert == c->address_families_whitelist) { + r = set_put(c->address_families, INT_TO_PTR(af)); + if (r < 0) + return log_oom(); + } else + set_remove(c->address_families, INT_TO_PTR(af)); + } +} + +int config_parse_restrict_namespaces( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + unsigned long flags; + bool invert = false; + int r; + + if (isempty(rvalue)) { + /* Reset to the default. */ + c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL; + return 0; + } + + /* Boolean parameter ignores the previous settings */ + r = parse_boolean(rvalue); + if (r > 0) { + c->restrict_namespaces = 0; + return 0; + } else if (r == 0) { + c->restrict_namespaces = NAMESPACE_FLAGS_ALL; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + /* Not a boolean argument, in this case it's a list of namespace types. */ + r = namespace_flags_from_string(rvalue, &flags); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue); + return 0; + } + + if (c->restrict_namespaces == NAMESPACE_FLAGS_INITIAL) + /* Initial assignment. Just set the value. */ + c->restrict_namespaces = invert ? (~flags) & NAMESPACE_FLAGS_ALL : flags; + else + /* Merge the value with the previous one. */ + SET_FLAG(c->restrict_namespaces, flags, !invert); + + return 0; +} +#endif + +int config_parse_unit_slice( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *k = NULL; + Unit *u = userdata, *slice = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = unit_name_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + r = manager_load_unit(u->manager, k, NULL, &error, &slice); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to load slice unit %s, ignoring: %s", k, bus_error_message(&error, r)); + return 0; + } + + r = unit_set_slice(u, slice); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to assign slice %s to unit %s, ignoring: %m", slice->id, u->id); + return 0; + } + + return 0; +} + +int config_parse_cpu_quota( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->cpu_quota_per_sec_usec = USEC_INFINITY; + return 0; + } + + r = parse_permille_unbounded(rvalue); + if (r <= 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid CPU quota '%s', ignoring.", rvalue); + return 0; + } + + c->cpu_quota_per_sec_usec = ((usec_t) r * USEC_PER_SEC) / 1000U; + return 0; +} + +int config_parse_memory_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + uint64_t bytes = CGROUP_LIMIT_MAX; + int r; + + if (!isempty(rvalue) && !streq(rvalue, "infinity")) { + + r = parse_permille(rvalue); + if (r < 0) { + r = parse_size(rvalue, 1024, &bytes); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid memory limit '%s', ignoring: %m", rvalue); + return 0; + } + } else + bytes = physical_memory_scale(r, 1000U); + + if (bytes >= UINT64_MAX || + (bytes <= 0 && !streq(lvalue, "MemorySwapMax"))) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Memory limit '%s' out of range, ignoring.", rvalue); + return 0; + } + } + + if (streq(lvalue, "MemoryMin")) + c->memory_min = bytes; + else if (streq(lvalue, "MemoryLow")) + c->memory_low = bytes; + else if (streq(lvalue, "MemoryHigh")) + c->memory_high = bytes; + else if (streq(lvalue, "MemoryMax")) + c->memory_max = bytes; + else if (streq(lvalue, "MemorySwapMax")) + c->memory_swap_max = bytes; + else if (streq(lvalue, "MemoryLimit")) + c->memory_limit = bytes; + else + return -EINVAL; + + return 0; +} + +int config_parse_tasks_max( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *tasks_max = data, v; + Unit *u = userdata; + int r; + + if (isempty(rvalue)) { + *tasks_max = u ? u->manager->default_tasks_max : UINT64_MAX; + return 0; + } + + if (streq(rvalue, "infinity")) { + *tasks_max = CGROUP_LIMIT_MAX; + return 0; + } + + r = parse_permille(rvalue); + if (r < 0) { + r = safe_atou64(rvalue, &v); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid maximum tasks value '%s', ignoring: %m", rvalue); + return 0; + } + } else + v = system_tasks_max_scale(r, 1000U); + + if (v <= 0 || v >= UINT64_MAX) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Maximum tasks value '%s' out of range, ignoring.", rvalue); + return 0; + } + + *tasks_max = v; + return 0; +} + +int config_parse_delegate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + UnitType t; + int r; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_delegate) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Delegate= setting not supported for this unit type, ignoring."); + return 0; + } + + /* We either accept a boolean value, which may be used to turn on delegation for all controllers, or turn it + * off for all. Or it takes a list of controller names, in which case we add the specified controllers to the + * mask to delegate. */ + + if (isempty(rvalue)) { + /* An empty string resets controllers and set Delegate=yes. */ + c->delegate = true; + c->delegate_controllers = 0; + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + const char *p = rvalue; + CGroupMask mask = 0; + + for (;;) { + _cleanup_free_ char *word = NULL; + CGroupController cc; + + r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + cc = cgroup_controller_from_string(word); + if (cc < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid controller name '%s', ignoring", word); + continue; + } + + mask |= CGROUP_CONTROLLER_TO_MASK(cc); + } + + c->delegate = true; + c->delegate_controllers |= mask; + + } else if (r > 0) { + c->delegate = true; + c->delegate_controllers = _CGROUP_MASK_ALL; + } else { + c->delegate = false; + c->delegate_controllers = 0; + } + + return 0; +} + +int config_parse_device_allow( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupContext *c = data; + const char *p = rvalue; + int r; + + if (isempty(rvalue)) { + while (c->device_allow) + cgroup_context_free_device_allow(c, c->device_allow); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_QUOTES); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device path and rights from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + if (!STARTSWITH_SET(resolved, "block-", "char-")) { + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (!valid_device_node_path(resolved)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid device node path '%s', ignoring.", resolved); + return 0; + } + } + + if (!isempty(p) && !in_charset(p, "rwm")) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid device rights '%s', ignoring.", p); + return 0; + } + + return cgroup_add_device_allow(c, resolved, p); +} + +int config_parse_io_device_weight( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupIODeviceWeight *w; + CGroupContext *c = data; + const char *p = rvalue; + uint64_t u; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + while (c->io_device_weights) + cgroup_context_free_io_device_weight(c, c->io_device_weights); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_QUOTES); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device path and weight from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = cg_weight_parse(p, &u); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "IO weight '%s' invalid, ignoring: %m", p); + return 0; + } + + assert(u != CGROUP_WEIGHT_INVALID); + + w = new0(CGroupIODeviceWeight, 1); + if (!w) + return log_oom(); + + w->path = TAKE_PTR(resolved); + w->weight = u; + + LIST_PREPEND(device_weights, c->io_device_weights, w); + return 0; +} + +int config_parse_io_device_latency( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupIODeviceLatency *l; + CGroupContext *c = data; + const char *p = rvalue; + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + while (c->io_device_latencies) + cgroup_context_free_io_device_latency(c, c->io_device_latencies); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_QUOTES); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device path and latency from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (parse_sec(p, &usec) < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse timer value, ignoring: %s", p); + return 0; + } + + l = new0(CGroupIODeviceLatency, 1); + if (!l) + return log_oom(); + + l->path = TAKE_PTR(resolved); + l->target_usec = usec; + + LIST_PREPEND(device_latencies, c->io_device_latencies, l); + return 0; +} + +int config_parse_io_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupIODeviceLimit *l = NULL, *t; + CGroupContext *c = data; + CGroupIOLimitType type; + const char *p = rvalue; + uint64_t num; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + type = cgroup_io_limit_type_from_string(lvalue); + assert(type >= 0); + + if (isempty(rvalue)) { + LIST_FOREACH(device_limits, l, c->io_device_limits) + l->limits[type] = cgroup_io_limit_defaults[type]; + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_QUOTES); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (streq("infinity", p)) + num = CGROUP_LIMIT_MAX; + else { + r = parse_size(p, 1000, &num); + if (r < 0 || num <= 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid IO limit '%s', ignoring.", p); + return 0; + } + } + + LIST_FOREACH(device_limits, t, c->io_device_limits) { + if (path_equal(resolved, t->path)) { + l = t; + break; + } + } + + if (!l) { + CGroupIOLimitType ttype; + + l = new0(CGroupIODeviceLimit, 1); + if (!l) + return log_oom(); + + l->path = TAKE_PTR(resolved); + for (ttype = 0; ttype < _CGROUP_IO_LIMIT_TYPE_MAX; ttype++) + l->limits[ttype] = cgroup_io_limit_defaults[ttype]; + + LIST_PREPEND(device_limits, c->io_device_limits, l); + } + + l->limits[type] = num; + + return 0; +} + +int config_parse_blockio_device_weight( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupBlockIODeviceWeight *w; + CGroupContext *c = data; + const char *p = rvalue; + uint64_t u; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + while (c->blockio_device_weights) + cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_QUOTES); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device node and weight from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = cg_blkio_weight_parse(p, &u); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid block IO weight '%s', ignoring: %m", p); + return 0; + } + + assert(u != CGROUP_BLKIO_WEIGHT_INVALID); + + w = new0(CGroupBlockIODeviceWeight, 1); + if (!w) + return log_oom(); + + w->path = TAKE_PTR(resolved); + w->weight = u; + + LIST_PREPEND(device_weights, c->blockio_device_weights, w); + return 0; +} + +int config_parse_blockio_bandwidth( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupBlockIODeviceBandwidth *b = NULL, *t; + CGroupContext *c = data; + const char *p = rvalue; + uint64_t bytes; + bool read; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + read = streq("BlockIOReadBandwidth", lvalue); + + if (isempty(rvalue)) { + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + b->rbps = CGROUP_LIMIT_MAX; + b->wbps = CGROUP_LIMIT_MAX; + } + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_QUOTES); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = parse_size(p, 1000, &bytes); + if (r < 0 || bytes <= 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid Block IO Bandwidth '%s', ignoring.", p); + return 0; + } + + LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths) { + if (path_equal(resolved, t->path)) { + b = t; + break; + } + } + + if (!t) { + b = new0(CGroupBlockIODeviceBandwidth, 1); + if (!b) + return log_oom(); + + b->path = TAKE_PTR(resolved); + b->rbps = CGROUP_LIMIT_MAX; + b->wbps = CGROUP_LIMIT_MAX; + + LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, b); + } + + if (read) + b->rbps = bytes; + else + b->wbps = bytes; + + return 0; +} + +int config_parse_job_mode_isolate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + JobMode *m = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse boolean, ignoring: %s", rvalue); + return 0; + } + + log_notice("%s is deprecated. Please use OnFailureJobMode= instead", lvalue); + + *m = r ? JOB_ISOLATE : JOB_REPLACE; + return 0; +} + +int config_parse_exec_directories( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char***rt = data; + Unit *u = userdata; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *rt = strv_free(*rt); + return 0; + } + + for (p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word); + continue; + } + + r = path_simplify_and_warn(k, PATH_CHECK_RELATIVE, unit, filename, line, lvalue); + if (r < 0) + continue; + + if (path_startswith(k, "private")) { + log_syntax(unit, LOG_ERR, filename, line, 0, + "%s= path can't be 'private', ignoring assignment: %s", lvalue, word); + continue; + } + + r = strv_push(rt, k); + if (r < 0) + return log_oom(); + k = NULL; + } +} + +int config_parse_set_status( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + size_t l; + const char *word, *state; + int r; + ExitStatusSet *status_set = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + /* Empty assignment resets the list */ + if (isempty(rvalue)) { + exit_status_set_free(status_set); + return 0; + } + + FOREACH_WORD(word, l, rvalue, state) { + _cleanup_free_ char *temp; + int val; + Set **set; + + temp = strndup(word, l); + if (!temp) + return log_oom(); + + r = safe_atoi(temp, &val); + if (r < 0) { + val = signal_from_string(temp); + + if (val <= 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse value, ignoring: %s", word); + continue; + } + set = &status_set->signal; + } else { + if (val < 0 || val > 255) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Value %d is outside range 0-255, ignoring", val); + continue; + } + set = &status_set->status; + } + + r = set_ensure_allocated(set, NULL); + if (r < 0) + return log_oom(); + + r = set_put(*set, INT_TO_PTR(val)); + if (r < 0) + return log_oom(); + } + if (!isempty(state)) + log_syntax(unit, LOG_ERR, filename, line, 0, "Trailing garbage, ignoring."); + + return 0; +} + +int config_parse_namespace_path_strv( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = userdata; + char*** sv = data; + const char *p = rvalue; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *sv = strv_free(*sv); + return 0; + } + + for (;;) { + _cleanup_free_ char *word = NULL, *resolved = NULL, *joined = NULL; + const char *w; + bool ignore_enoent = false, shall_prefix = false; + + r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue); + return 0; + } + + w = word; + if (startswith(w, "-")) { + ignore_enoent = true; + w++; + } + if (startswith(w, "+")) { + shall_prefix = true; + w++; + } + + r = unit_full_printf(u, w, &resolved); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", w); + continue; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + joined = strjoin(ignore_enoent ? "-" : "", + shall_prefix ? "+" : "", + resolved); + + r = strv_push(sv, joined); + if (r < 0) + return log_oom(); + + joined = NULL; + } + + return 0; +} + +int config_parse_temporary_filesystems( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = userdata; + ExecContext *c = data; + const char *p = rvalue; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems); + c->temporary_filesystems = NULL; + c->n_temporary_filesystems = 0; + return 0; + } + + for (;;) { + _cleanup_free_ char *word = NULL, *path = NULL, *resolved = NULL; + const char *w; + + r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue); + return 0; + } + + w = word; + r = extract_first_word(&w, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to extract first word, ignoring: %s", word); + continue; + } + if (r == 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid syntax, ignoring: %s", word); + continue; + } + + r = unit_full_printf(u, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", path); + continue; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, resolved, w); + if (r < 0) + return log_oom(); + } +} + +int config_parse_bind_paths( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + Unit *u = userdata; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + bind_mount_free_many(c->bind_mounts, c->n_bind_mounts); + c->bind_mounts = NULL; + c->n_bind_mounts = 0; + return 0; + } + + p = rvalue; + for (;;) { + _cleanup_free_ char *source = NULL, *destination = NULL; + _cleanup_free_ char *sresolved = NULL, *dresolved = NULL; + char *s = NULL, *d = NULL; + bool rbind = true, ignore_enoent = false; + + r = extract_first_word(&p, &source, ":" WHITESPACE, EXTRACT_QUOTES|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue); + return 0; + } + + r = unit_full_printf(u, source, &sresolved); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolved unit specifiers in \"%s\", ignoring: %m", source); + continue; + } + + s = sresolved; + if (s[0] == '-') { + ignore_enoent = true; + s++; + } + + r = path_simplify_and_warn(s, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + /* Optionally, the destination is specified. */ + if (p && p[-1] == ':') { + r = extract_first_word(&p, &destination, ":" WHITESPACE, EXTRACT_QUOTES|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue); + return 0; + } + if (r == 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Missing argument after ':', ignoring: %s", s); + continue; + } + + r = unit_full_printf(u, destination, &dresolved); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to resolved specifiers in \"%s\", ignoring: %m", destination); + continue; + } + + r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + d = dresolved; + + /* Optionally, there's also a short option string specified */ + if (p && p[-1] == ':') { + _cleanup_free_ char *options = NULL; + + r = extract_first_word(&p, &options, NULL, EXTRACT_QUOTES); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse %s: %s", lvalue, rvalue); + return 0; + } + + if (isempty(options) || streq(options, "rbind")) + rbind = true; + else if (streq(options, "norbind")) + rbind = false; + else { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid option string, ignoring setting: %s", options); + continue; + } + } + } else + d = s; + + r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts, + &(BindMount) { + .source = s, + .destination = d, + .read_only = !!strstr(lvalue, "ReadOnly"), + .recursive = rbind, + .ignore_enoent = ignore_enoent, + }); + if (r < 0) + return log_oom(); + } + + return 0; +} + +int config_parse_job_timeout_sec( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = data; + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = parse_sec_fix_0(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse JobTimeoutSec= parameter, ignoring: %s", rvalue); + return 0; + } + + /* If the user explicitly changed JobTimeoutSec= also change JobRunningTimeoutSec=, for compatibility with old + * versions. If JobRunningTimeoutSec= was explicitly set, avoid this however as whatever the user picked should + * count. */ + + if (!u->job_running_timeout_set) + u->job_running_timeout = usec; + + u->job_timeout = usec; + + return 0; +} + +int config_parse_job_running_timeout_sec( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = data; + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = parse_sec_fix_0(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse JobRunningTimeoutSec= parameter, ignoring: %s", rvalue); + return 0; + } + + u->job_running_timeout = usec; + u->job_running_timeout_set = true; + + return 0; +} + +int config_parse_emergency_action( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Manager *m = NULL; + EmergencyAction *x = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (unit) + m = ((Unit*) userdata)->manager; + else + m = data; + + r = parse_emergency_action(rvalue, MANAGER_IS_SYSTEM(m), x); + if (r < 0) { + if (r == -EOPNOTSUPP && MANAGER_IS_USER(m)) { + /* Compat mode: remove for systemd 241. */ + + log_syntax(unit, LOG_INFO, filename, line, r, + "%s= in user mode specified as \"%s\", using \"exit-force\" instead.", + lvalue, rvalue); + *x = EMERGENCY_ACTION_EXIT_FORCE; + return 0; + } + + if (r == -EOPNOTSUPP) + log_syntax(unit, LOG_ERR, filename, line, r, + "%s= specified as %s mode action, ignoring: %s", + lvalue, MANAGER_IS_SYSTEM(m) ? "user" : "system", rvalue); + else + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to parse %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + + return 0; +} + +int config_parse_pid_file( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL, *n = NULL; + Unit *u = userdata; + char **s = data; + const char *e; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + /* An empty assignment removes already set value. */ + *s = mfree(*s); + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + /* If this is a relative path make it absolute by prefixing the /run */ + n = path_make_absolute(k, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + if (!n) + return log_oom(); + + /* Check that the result is a sensible path */ + r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return r; + + e = path_startswith(n, "/var/run/"); + if (e) { + char *z; + + z = strjoin("/run/", e); + if (!z) + return log_oom(); + + log_syntax(unit, LOG_NOTICE, filename, line, 0, "PIDFile= references path below legacy directory /var/run/, updating %s → %s; please update the unit file accordingly.", n, z); + + free_and_replace(*s, z); + } else + free_and_replace(*s, n); + + return 0; +} + +int config_parse_exit_status( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *exit_status = data, r; + uint8_t u; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(exit_status); + + if (isempty(rvalue)) { + *exit_status = -1; + return 0; + } + + r = safe_atou8(rvalue, &u); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse exit status '%s', ignoring: %m", rvalue); + return 0; + } + + *exit_status = u; + return 0; +} + +int config_parse_disable_controllers( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int r; + CGroupContext *c = data; + CGroupMask disabled_mask; + + /* 1. If empty, make all controllers eligible for use again. + * 2. If non-empty, merge all listed controllers, space separated. */ + + if (isempty(rvalue)) { + c->disable_controllers = 0; + return 0; + } + + r = cg_mask_from_string(rvalue, &disabled_mask); + if (r < 0 || disabled_mask <= 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue); + return 0; + } + + c->disable_controllers |= disabled_mask; + + return 0; +} + +#define FOLLOW_MAX 8 + +static int open_follow(char **filename, FILE **_f, Set *names, char **_final) { + char *id = NULL; + unsigned c = 0; + int fd, r; + FILE *f; + + assert(filename); + assert(*filename); + assert(_f); + assert(names); + + /* This will update the filename pointer if the loaded file is + * reached by a symlink. The old string will be freed. */ + + for (;;) { + char *target, *name; + + if (c++ >= FOLLOW_MAX) + return -ELOOP; + + path_simplify(*filename, false); + + /* Add the file name we are currently looking at to + * the names of this unit, but only if it is a valid + * unit name. */ + name = basename(*filename); + if (unit_name_is_valid(name, UNIT_NAME_ANY)) { + + id = set_get(names, name); + if (!id) { + id = strdup(name); + if (!id) + return -ENOMEM; + + r = set_consume(names, id); + if (r < 0) + return r; + } + } + + /* Try to open the file name, but don't if its a symlink */ + fd = open(*filename, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd >= 0) + break; + + if (errno != ELOOP) + return -errno; + + /* Hmm, so this is a symlink. Let's read the name, and follow it manually */ + r = readlink_and_make_absolute(*filename, &target); + if (r < 0) + return r; + + free_and_replace(*filename, target); + } + + f = fdopen(fd, "r"); + if (!f) { + safe_close(fd); + return -errno; + } + + *_f = f; + *_final = id; + + return 0; +} + +static int merge_by_names(Unit **u, Set *names, const char *id) { + char *k; + int r; + + assert(u); + assert(*u); + assert(names); + + /* Let's try to add in all symlink names we found */ + while ((k = set_steal_first(names))) { + + /* First try to merge in the other name into our + * unit */ + r = unit_merge_by_name(*u, k); + if (r < 0) { + Unit *other; + + /* Hmm, we couldn't merge the other unit into + * ours? Then let's try it the other way + * round */ + + /* If the symlink name we are looking at is unit template, then + we must search for instance of this template */ + if (unit_name_is_valid(k, UNIT_NAME_TEMPLATE) && (*u)->instance) { + _cleanup_free_ char *instance = NULL; + + r = unit_name_replace_instance(k, (*u)->instance, &instance); + if (r < 0) + return r; + + other = manager_get_unit((*u)->manager, instance); + } else + other = manager_get_unit((*u)->manager, k); + + free(k); + + if (other) { + r = unit_merge(other, *u); + if (r >= 0) { + *u = other; + return merge_by_names(u, names, NULL); + } + } + + return r; + } + + if (id == k) + unit_choose_id(*u, id); + + free(k); + } + + return 0; +} + +static int load_from_path(Unit *u, const char *path) { + _cleanup_set_free_free_ Set *symlink_names = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *filename = NULL; + char *id = NULL; + Unit *merged; + struct stat st; + int r; + + assert(u); + assert(path); + + symlink_names = set_new(&string_hash_ops); + if (!symlink_names) + return -ENOMEM; + + if (path_is_absolute(path)) { + + filename = strdup(path); + if (!filename) + return -ENOMEM; + + r = open_follow(&filename, &f, symlink_names, &id); + if (r < 0) { + filename = mfree(filename); + if (r != -ENOENT) + return r; + } + + } else { + char **p; + + STRV_FOREACH(p, u->manager->lookup_paths.search_path) { + + /* Instead of opening the path right away, we manually + * follow all symlinks and add their name to our unit + * name set while doing so */ + filename = path_make_absolute(path, *p); + if (!filename) + return -ENOMEM; + + if (u->manager->unit_path_cache && + !set_get(u->manager->unit_path_cache, filename)) + r = -ENOENT; + else + r = open_follow(&filename, &f, symlink_names, &id); + if (r >= 0) + break; + + /* ENOENT means that the file is missing or is a dangling symlink. + * ENOTDIR means that one of paths we expect to be is a directory + * is not a directory, we should just ignore that. + * EACCES means that the directory or file permissions are wrong. + */ + if (r == -EACCES) + log_debug_errno(r, "Cannot access \"%s\": %m", filename); + else if (!IN_SET(r, -ENOENT, -ENOTDIR)) + return r; + + filename = mfree(filename); + /* Empty the symlink names for the next run */ + set_clear_free(symlink_names); + } + } + + if (!filename) + /* Hmm, no suitable file found? */ + return 0; + + if (!unit_type_may_alias(u->type) && set_size(symlink_names) > 1) { + log_unit_warning(u, "Unit type of %s does not support alias names, refusing loading via symlink.", u->id); + return -ELOOP; + } + + merged = u; + r = merge_by_names(&merged, symlink_names, id); + if (r < 0) + return r; + + if (merged != u) { + u->load_state = UNIT_MERGED; + return 0; + } + + if (fstat(fileno(f), &st) < 0) + return -errno; + + if (null_or_empty(&st)) { + u->load_state = UNIT_MASKED; + u->fragment_mtime = 0; + } else { + u->load_state = UNIT_LOADED; + u->fragment_mtime = timespec_load(&st.st_mtim); + + /* Now, parse the file contents */ + r = config_parse(u->id, filename, f, + UNIT_VTABLE(u)->sections, + config_item_perf_lookup, load_fragment_gperf_lookup, + CONFIG_PARSE_ALLOW_INCLUDE, u); + if (r < 0) + return r; + } + + free_and_replace(u->fragment_path, filename); + + if (u->source_path) { + if (stat(u->source_path, &st) >= 0) + u->source_mtime = timespec_load(&st.st_mtim); + else + u->source_mtime = 0; + } + + return 0; +} + +int unit_load_fragment(Unit *u) { + int r; + Iterator i; + const char *t; + + assert(u); + assert(u->load_state == UNIT_STUB); + assert(u->id); + + if (u->transient) { + u->load_state = UNIT_LOADED; + return 0; + } + + /* First, try to find the unit under its id. We always look + * for unit files in the default directories, to make it easy + * to override things by placing things in /etc/systemd/system */ + r = load_from_path(u, u->id); + if (r < 0) + return r; + + /* Try to find an alias we can load this with */ + if (u->load_state == UNIT_STUB) { + SET_FOREACH(t, u->names, i) { + + if (t == u->id) + continue; + + r = load_from_path(u, t); + if (r < 0) + return r; + + if (u->load_state != UNIT_STUB) + break; + } + } + + /* And now, try looking for it under the suggested (originally linked) path */ + if (u->load_state == UNIT_STUB && u->fragment_path) { + + r = load_from_path(u, u->fragment_path); + if (r < 0) + return r; + + if (u->load_state == UNIT_STUB) + /* Hmm, this didn't work? Then let's get rid + * of the fragment path stored for us, so that + * we don't point to an invalid location. */ + u->fragment_path = mfree(u->fragment_path); + } + + /* Look for a template */ + if (u->load_state == UNIT_STUB && u->instance) { + _cleanup_free_ char *k = NULL; + + r = unit_name_template(u->id, &k); + if (r < 0) + return r; + + r = load_from_path(u, k); + if (r < 0) { + if (r == -ENOEXEC) + log_unit_notice(u, "Unit configuration has fatal error, unit will not be started."); + return r; + } + + if (u->load_state == UNIT_STUB) { + SET_FOREACH(t, u->names, i) { + _cleanup_free_ char *z = NULL; + + if (t == u->id) + continue; + + r = unit_name_template(t, &z); + if (r < 0) + return r; + + r = load_from_path(u, z); + if (r < 0) + return r; + + if (u->load_state != UNIT_STUB) + break; + } + } + } + + return 0; +} + +void unit_dump_config_items(FILE *f) { + static const struct { + const ConfigParserCallback callback; + const char *rvalue; + } table[] = { + { config_parse_warn_compat, "NOTSUPPORTED" }, + { config_parse_int, "INTEGER" }, + { config_parse_unsigned, "UNSIGNED" }, + { config_parse_iec_size, "SIZE" }, + { config_parse_iec_uint64, "SIZE" }, + { config_parse_si_size, "SIZE" }, + { config_parse_bool, "BOOLEAN" }, + { config_parse_string, "STRING" }, + { config_parse_path, "PATH" }, + { config_parse_unit_path_printf, "PATH" }, + { config_parse_strv, "STRING [...]" }, + { config_parse_exec_nice, "NICE" }, + { config_parse_exec_oom_score_adjust, "OOMSCOREADJUST" }, + { config_parse_exec_io_class, "IOCLASS" }, + { config_parse_exec_io_priority, "IOPRIORITY" }, + { config_parse_exec_cpu_sched_policy, "CPUSCHEDPOLICY" }, + { config_parse_exec_cpu_sched_prio, "CPUSCHEDPRIO" }, + { config_parse_exec_cpu_affinity, "CPUAFFINITY" }, + { config_parse_mode, "MODE" }, + { config_parse_unit_env_file, "FILE" }, + { config_parse_exec_output, "OUTPUT" }, + { config_parse_exec_input, "INPUT" }, + { config_parse_log_facility, "FACILITY" }, + { config_parse_log_level, "LEVEL" }, + { config_parse_exec_secure_bits, "SECUREBITS" }, + { config_parse_capability_set, "BOUNDINGSET" }, + { config_parse_rlimit, "LIMIT" }, + { config_parse_unit_deps, "UNIT [...]" }, + { config_parse_exec, "PATH [ARGUMENT [...]]" }, + { config_parse_service_type, "SERVICETYPE" }, + { config_parse_service_restart, "SERVICERESTART" }, + { config_parse_kill_mode, "KILLMODE" }, + { config_parse_signal, "SIGNAL" }, + { config_parse_socket_listen, "SOCKET [...]" }, + { config_parse_socket_bind, "SOCKETBIND" }, + { config_parse_socket_bindtodevice, "NETWORKINTERFACE" }, + { config_parse_sec, "SECONDS" }, + { config_parse_nsec, "NANOSECONDS" }, + { config_parse_namespace_path_strv, "PATH [...]" }, + { config_parse_bind_paths, "PATH[:PATH[:OPTIONS]] [...]" }, + { config_parse_unit_requires_mounts_for, "PATH [...]" }, + { config_parse_exec_mount_flags, "MOUNTFLAG [...]" }, + { config_parse_unit_string_printf, "STRING" }, + { config_parse_trigger_unit, "UNIT" }, + { config_parse_timer, "TIMER" }, + { config_parse_path_spec, "PATH" }, + { config_parse_notify_access, "ACCESS" }, + { config_parse_ip_tos, "TOS" }, + { config_parse_unit_condition_path, "CONDITION" }, + { config_parse_unit_condition_string, "CONDITION" }, + { config_parse_unit_condition_null, "CONDITION" }, + { config_parse_unit_slice, "SLICE" }, + { config_parse_documentation, "URL" }, + { config_parse_service_timeout, "SECONDS" }, + { config_parse_emergency_action, "ACTION" }, + { config_parse_set_status, "STATUS" }, + { config_parse_service_sockets, "SOCKETS" }, + { config_parse_environ, "ENVIRON" }, +#if HAVE_SECCOMP + { config_parse_syscall_filter, "SYSCALLS" }, + { config_parse_syscall_archs, "ARCHS" }, + { config_parse_syscall_errno, "ERRNO" }, + { config_parse_address_families, "FAMILIES" }, + { config_parse_restrict_namespaces, "NAMESPACES" }, +#endif + { config_parse_cpu_shares, "SHARES" }, + { config_parse_cg_weight, "WEIGHT" }, + { config_parse_memory_limit, "LIMIT" }, + { config_parse_device_allow, "DEVICE" }, + { config_parse_device_policy, "POLICY" }, + { config_parse_io_limit, "LIMIT" }, + { config_parse_io_device_weight, "DEVICEWEIGHT" }, + { config_parse_io_device_latency, "DEVICELATENCY" }, + { config_parse_blockio_bandwidth, "BANDWIDTH" }, + { config_parse_blockio_weight, "WEIGHT" }, + { config_parse_blockio_device_weight, "DEVICEWEIGHT" }, + { config_parse_long, "LONG" }, + { config_parse_socket_service, "SERVICE" }, +#if HAVE_SELINUX + { config_parse_exec_selinux_context, "LABEL" }, +#endif + { config_parse_job_mode, "MODE" }, + { config_parse_job_mode_isolate, "BOOLEAN" }, + { config_parse_personality, "PERSONALITY" }, + }; + + const char *prev = NULL; + const char *i; + + assert(f); + + NULSTR_FOREACH(i, load_fragment_gperf_nulstr) { + const char *rvalue = "OTHER", *lvalue; + const ConfigPerfItem *p; + size_t prefix_len; + const char *dot; + unsigned j; + + assert_se(p = load_fragment_gperf_lookup(i, strlen(i))); + + /* Hide legacy settings */ + if (p->parse == config_parse_warn_compat && + p->ltype == DISABLED_LEGACY) + continue; + + for (j = 0; j < ELEMENTSOF(table); j++) + if (p->parse == table[j].callback) { + rvalue = table[j].rvalue; + break; + } + + dot = strchr(i, '.'); + lvalue = dot ? dot + 1 : i; + prefix_len = dot-i; + + if (dot) + if (!prev || !strneq(prev, i, prefix_len+1)) { + if (prev) + fputc('\n', f); + + fprintf(f, "[%.*s]\n", (int) prefix_len, i); + } + + fprintf(f, "%s=%s\n", lvalue, rvalue); + prev = i; + } +} diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h new file mode 100644 index 0000000..e0d3b4e --- /dev/null +++ b/src/core/load-fragment.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "conf-parser.h" +#include "unit.h" + +/* Read service data from .desktop file style configuration fragments */ + +int unit_load_fragment(Unit *u); + +void unit_dump_config_items(FILE *f); + +CONFIG_PARSER_PROTOTYPE(config_parse_unit_deps); +CONFIG_PARSER_PROTOTYPE(config_parse_obsolete_unit_deps); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_string_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_strv_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_strv_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_documentation); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_listen); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_protocol); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_bind); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_nice); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_oom_score_adjust); +CONFIG_PARSER_PROTOTYPE(config_parse_exec); +CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout); +CONFIG_PARSER_PROTOTYPE(config_parse_service_type); +CONFIG_PARSER_PROTOTYPE(config_parse_service_restart); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_bindtodevice); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_output); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_input); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_text); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_data); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_class); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_priority); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_prio); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_affinity); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_secure_bits); +CONFIG_PARSER_PROTOTYPE(config_parse_capability_set); +CONFIG_PARSER_PROTOTYPE(config_parse_kill_signal); +CONFIG_PARSER_PROTOTYPE(config_parse_final_kill_signal); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_flags); +CONFIG_PARSER_PROTOTYPE(config_parse_timer); +CONFIG_PARSER_PROTOTYPE(config_parse_trigger_unit); +CONFIG_PARSER_PROTOTYPE(config_parse_path_spec); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_service); +CONFIG_PARSER_PROTOTYPE(config_parse_service_sockets); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_env_file); +CONFIG_PARSER_PROTOTYPE(config_parse_ip_tos); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_path); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_string); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_null); +CONFIG_PARSER_PROTOTYPE(config_parse_kill_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_notify_access); +CONFIG_PARSER_PROTOTYPE(config_parse_emergency_action); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_requires_mounts_for); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_filter); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_archs); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_errno); +CONFIG_PARSER_PROTOTYPE(config_parse_environ); +CONFIG_PARSER_PROTOTYPE(config_parse_pass_environ); +CONFIG_PARSER_PROTOTYPE(config_parse_unset_environ); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_slice); +CONFIG_PARSER_PROTOTYPE(config_parse_cg_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares); +CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max); +CONFIG_PARSER_PROTOTYPE(config_parse_delegate); +CONFIG_PARSER_PROTOTYPE(config_parse_device_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_device_allow); +CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency); +CONFIG_PARSER_PROTOTYPE(config_parse_io_device_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_io_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_blockio_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_blockio_device_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_blockio_bandwidth); +CONFIG_PARSER_PROTOTYPE(config_parse_job_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_job_mode_isolate); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_selinux_context); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_apparmor_profile); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_smack_process_label); +CONFIG_PARSER_PROTOTYPE(config_parse_address_families); +CONFIG_PARSER_PROTOTYPE(config_parse_runtime_preserve_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_directories); +CONFIG_PARSER_PROTOTYPE(config_parse_set_status); +CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv); +CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems); +CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota); +CONFIG_PARSER_PROTOTYPE(config_parse_protect_home); +CONFIG_PARSER_PROTOTYPE(config_parse_protect_system); +CONFIG_PARSER_PROTOTYPE(config_parse_bus_name); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_utmp_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_working_directory); +CONFIG_PARSER_PROTOTYPE(config_parse_fdname); +CONFIG_PARSER_PROTOTYPE(config_parse_sec_fix_0); +CONFIG_PARSER_PROTOTYPE(config_parse_user_group); +CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv); +CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces); +CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec); +CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec); +CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields); +CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_pid_file); +CONFIG_PARSER_PROTOTYPE(config_parse_exit_status); +CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers); + +/* gperf prototypes */ +const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length); +extern const char load_fragment_gperf_nulstr[]; diff --git a/src/core/locale-setup.c b/src/core/locale-setup.c new file mode 100644 index 0000000..aa4a89c --- /dev/null +++ b/src/core/locale-setup.c @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include "env-file.h" +#include "env-util.h" +#include "locale-setup.h" +#include "locale-util.h" +#include "proc-cmdline.h" +#include "string-util.h" +#include "strv.h" +#include "util.h" +#include "virt.h" + +int locale_setup(char ***environment) { + _cleanup_(locale_variables_freep) char *variables[_VARIABLE_LC_MAX] = {}; + _cleanup_strv_free_ char **add = NULL; + LocaleVariable i; + int r; + + r = proc_cmdline_get_key_many(PROC_CMDLINE_STRIP_RD_PREFIX, + "locale.LANG", &variables[VARIABLE_LANG], + "locale.LANGUAGE", &variables[VARIABLE_LANGUAGE], + "locale.LC_CTYPE", &variables[VARIABLE_LC_CTYPE], + "locale.LC_NUMERIC", &variables[VARIABLE_LC_NUMERIC], + "locale.LC_TIME", &variables[VARIABLE_LC_TIME], + "locale.LC_COLLATE", &variables[VARIABLE_LC_COLLATE], + "locale.LC_MONETARY", &variables[VARIABLE_LC_MONETARY], + "locale.LC_MESSAGES", &variables[VARIABLE_LC_MESSAGES], + "locale.LC_PAPER", &variables[VARIABLE_LC_PAPER], + "locale.LC_NAME", &variables[VARIABLE_LC_NAME], + "locale.LC_ADDRESS", &variables[VARIABLE_LC_ADDRESS], + "locale.LC_TELEPHONE", &variables[VARIABLE_LC_TELEPHONE], + "locale.LC_MEASUREMENT", &variables[VARIABLE_LC_MEASUREMENT], + "locale.LC_IDENTIFICATION", &variables[VARIABLE_LC_IDENTIFICATION]); + if (r < 0 && r != -ENOENT) + log_warning_errno(r, "Failed to read /proc/cmdline: %m"); + + /* Hmm, nothing set on the kernel cmd line? Then let's try /etc/locale.conf */ + if (r <= 0) { + r = parse_env_file(NULL, "/etc/locale.conf", + "LANG", &variables[VARIABLE_LANG], + "LANGUAGE", &variables[VARIABLE_LANGUAGE], + "LC_CTYPE", &variables[VARIABLE_LC_CTYPE], + "LC_NUMERIC", &variables[VARIABLE_LC_NUMERIC], + "LC_TIME", &variables[VARIABLE_LC_TIME], + "LC_COLLATE", &variables[VARIABLE_LC_COLLATE], + "LC_MONETARY", &variables[VARIABLE_LC_MONETARY], + "LC_MESSAGES", &variables[VARIABLE_LC_MESSAGES], + "LC_PAPER", &variables[VARIABLE_LC_PAPER], + "LC_NAME", &variables[VARIABLE_LC_NAME], + "LC_ADDRESS", &variables[VARIABLE_LC_ADDRESS], + "LC_TELEPHONE", &variables[VARIABLE_LC_TELEPHONE], + "LC_MEASUREMENT", &variables[VARIABLE_LC_MEASUREMENT], + "LC_IDENTIFICATION", &variables[VARIABLE_LC_IDENTIFICATION]); + if (r < 0 && r != -ENOENT) + log_warning_errno(r, "Failed to read /etc/locale.conf: %m"); + } + + for (i = 0; i < _VARIABLE_LC_MAX; i++) { + char *s; + + if (!variables[i]) + continue; + + s = strjoin(locale_variable_to_string(i), "=", variables[i]); + if (!s) + return -ENOMEM; + + if (strv_consume(&add, s) < 0) + return -ENOMEM; + } + + if (strv_isempty(add)) { + /* If no locale is configured then default to compile-time default. */ + + add = strv_new("LANG=" SYSTEMD_DEFAULT_LOCALE); + if (!add) + return -ENOMEM; + } + + if (strv_isempty(*environment)) + strv_free_and_replace(*environment, add); + else { + char **merged; + + merged = strv_env_merge(2, *environment, add); + if (!merged) + return -ENOMEM; + + strv_free_and_replace(*environment, merged); + } + + return 0; +} diff --git a/src/core/locale-setup.h b/src/core/locale-setup.h new file mode 100644 index 0000000..01fadd0 --- /dev/null +++ b/src/core/locale-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +int locale_setup(char ***environment); diff --git a/src/core/loopback-setup.c b/src/core/loopback-setup.c new file mode 100644 index 0000000..f613db8 --- /dev/null +++ b/src/core/loopback-setup.c @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <net/if.h> +#include <stdlib.h> + +#include "sd-netlink.h" + +#include "loopback-setup.h" +#include "missing.h" +#include "netlink-util.h" + +#define LOOPBACK_SETUP_TIMEOUT_USEC (5 * USEC_PER_SEC) + +struct state { + unsigned n_messages; + int rcode; + const char *error_message; + const char *success_message; +}; + +static int generic_handler(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) { + struct state *s = userdata; + int r; + + assert(s); + assert(s->n_messages > 0); + s->n_messages--; + + errno = 0; + + r = sd_netlink_message_get_errno(m); + if (r < 0) + log_debug_errno(r, "%s: %m", s->error_message); + else + log_debug("%s", s->success_message); + + s->rcode = r; + return 0; +} + +static int start_loopback(sd_netlink *rtnl, struct state *s) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(rtnl); + assert(s); + + r = sd_rtnl_message_new_link(rtnl, &req, RTM_SETLINK, LOOPBACK_IFINDEX); + if (r < 0) + return r; + + r = sd_rtnl_message_link_set_flags(req, IFF_UP, IFF_UP); + if (r < 0) + return r; + + r = sd_netlink_call_async(rtnl, NULL, req, generic_handler, NULL, s, LOOPBACK_SETUP_TIMEOUT_USEC, "systemd-start-loopback"); + if (r < 0) + return r; + + s->n_messages ++; + return 0; +} + +static int add_ipv4_address(sd_netlink *rtnl, struct state *s) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(rtnl); + assert(s); + + r = sd_rtnl_message_new_addr(rtnl, &req, RTM_NEWADDR, LOOPBACK_IFINDEX, AF_INET); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_prefixlen(req, 8); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_flags(req, IFA_F_PERMANENT); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_scope(req, RT_SCOPE_HOST); + if (r < 0) + return r; + + r = sd_netlink_message_append_in_addr(req, IFA_LOCAL, &(struct in_addr) { .s_addr = htobe32(INADDR_LOOPBACK) } ); + if (r < 0) + return r; + + r = sd_netlink_call_async(rtnl, NULL, req, generic_handler, NULL, s, USEC_INFINITY, "systemd-loopback-ipv4"); + if (r < 0) + return r; + + s->n_messages ++; + return 0; +} + +static int add_ipv6_address(sd_netlink *rtnl, struct state *s) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(rtnl); + assert(s); + + r = sd_rtnl_message_new_addr(rtnl, &req, RTM_NEWADDR, LOOPBACK_IFINDEX, AF_INET6); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_prefixlen(req, 128); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_flags(req, IFA_F_PERMANENT); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_scope(req, RT_SCOPE_HOST); + if (r < 0) + return r; + + r = sd_netlink_message_append_in6_addr(req, IFA_LOCAL, &in6addr_loopback); + if (r < 0) + return r; + + r = sd_netlink_call_async(rtnl, NULL, req, generic_handler, NULL, s, USEC_INFINITY, "systemd-loopback-ipv6"); + if (r < 0) + return r; + + s->n_messages ++; + return 0; +} + +static bool check_loopback(sd_netlink *rtnl) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + unsigned flags; + int r; + + r = sd_rtnl_message_new_link(rtnl, &req, RTM_GETLINK, LOOPBACK_IFINDEX); + if (r < 0) + return false; + + r = sd_netlink_call(rtnl, req, USEC_INFINITY, &reply); + if (r < 0) + return false; + + r = sd_rtnl_message_link_get_flags(reply, &flags); + if (r < 0) + return false; + + return flags & IFF_UP; +} + +int loopback_setup(void) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + struct state state_4 = { + .error_message = "Failed to add address 127.0.0.1 to loopback interface", + .success_message = "Successfully added address 127.0.0.1 to loopback interface", + }, state_6 = { + .error_message = "Failed to add address ::1 to loopback interface", + .success_message = "Successfully added address ::1 to loopback interface", + }, state_up = { + .error_message = "Failed to bring loopback interface up", + .success_message = "Successfully brought loopback interface up", + }; + int r; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to open netlink: %m"); + + /* Note that we add the IP addresses here explicitly even though the kernel does that too implicitly when + * setting up the loopback device. The reason we do this here a second time (and possibly race against the + * kernel) is that we want to synchronously wait until the IP addresses are set up correctly, see + * + * https://github.com/systemd/systemd/issues/5641 */ + + r = add_ipv4_address(rtnl, &state_4); + if (r < 0) + return log_error_errno(r, "Failed to enqueue IPv4 loopback address add request: %m"); + + r = add_ipv6_address(rtnl, &state_6); + if (r < 0) + return log_error_errno(r, "Failed to enqueue IPv6 loopback address add request: %m"); + + r = start_loopback(rtnl, &state_up); + if (r < 0) + return log_error_errno(r, "Failed to enqueue loopback interface start request: %m"); + + while (state_4.n_messages + state_6.n_messages + state_up.n_messages > 0) { + r = sd_netlink_wait(rtnl, LOOPBACK_SETUP_TIMEOUT_USEC); + if (r < 0) + return log_error_errno(r, "Failed to wait for netlink event: %m"); + + r = sd_netlink_process(rtnl, NULL); + if (r < 0) + return log_warning_errno(r, "Failed to process netlink event: %m"); + } + + /* Note that we don't really care whether the addresses could be added or not */ + if (state_up.rcode != 0) { + /* If we lack the permissions to configure the loopback device, + * but we find it to be already configured, let's exit cleanly, + * in order to supported unprivileged containers. */ + if (state_up.rcode == -EPERM && check_loopback(rtnl)) + return 0; + + return log_warning_errno(state_up.rcode, "Failed to configure loopback device: %m"); + } + + return 0; +} diff --git a/src/core/loopback-setup.h b/src/core/loopback-setup.h new file mode 100644 index 0000000..c0eea10 --- /dev/null +++ b/src/core/loopback-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +int loopback_setup(void); diff --git a/src/core/machine-id-setup.c b/src/core/machine-id-setup.c new file mode 100644 index 0000000..aae5480 --- /dev/null +++ b/src/core/machine-id-setup.c @@ -0,0 +1,243 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <fcntl.h> +#include <sched.h> +#include <sys/mount.h> +#include <unistd.h> + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "id128-util.h" +#include "log.h" +#include "machine-id-setup.h" +#include "macro.h" +#include "mkdir.h" +#include "mountpoint-util.h" +#include "path-util.h" +#include "process-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "umask-util.h" +#include "util.h" +#include "virt.h" + +static int generate_machine_id(const char *root, sd_id128_t *ret) { + const char *dbus_machine_id; + _cleanup_close_ int fd = -1; + int r; + + assert(ret); + + /* First, try reading the D-Bus machine id, unless it is a symlink */ + dbus_machine_id = prefix_roota(root, "/var/lib/dbus/machine-id"); + fd = open(dbus_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd >= 0) { + if (id128_read_fd(fd, ID128_PLAIN, ret) >= 0) { + log_info("Initializing machine ID from D-Bus machine ID."); + return 0; + } + + fd = safe_close(fd); + } + + if (isempty(root)) { + /* If that didn't work, see if we are running in a container, + * and a machine ID was passed in via $container_uuid the way + * libvirt/LXC does it */ + + if (detect_container() > 0) { + _cleanup_free_ char *e = NULL; + + if (getenv_for_pid(1, "container_uuid", &e) > 0 && + sd_id128_from_string(e, ret) >= 0) { + log_info("Initializing machine ID from container UUID."); + return 0; + } + + } else if (detect_vm() == VIRTUALIZATION_KVM) { + + /* If we are not running in a container, see if we are + * running in qemu/kvm and a machine ID was passed in + * via -uuid on the qemu/kvm command line */ + + if (id128_read("/sys/class/dmi/id/product_uuid", ID128_UUID, ret) >= 0) { + log_info("Initializing machine ID from KVM UUID."); + return 0; + } + } + } + + /* If that didn't work, generate a random machine id */ + r = sd_id128_randomize(ret); + if (r < 0) + return log_error_errno(r, "Failed to generate randomized machine ID: %m"); + + log_info("Initializing machine ID from random generator."); + return 0; +} + +int machine_id_setup(const char *root, sd_id128_t machine_id, sd_id128_t *ret) { + const char *etc_machine_id, *run_machine_id; + _cleanup_close_ int fd = -1; + bool writable; + int r; + + etc_machine_id = prefix_roota(root, "/etc/machine-id"); + + RUN_WITH_UMASK(0000) { + /* We create this 0444, to indicate that this isn't really + * something you should ever modify. Of course, since the file + * will be owned by root it doesn't matter much, but maybe + * people look. */ + + (void) mkdir_parents(etc_machine_id, 0755); + fd = open(etc_machine_id, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY, 0444); + if (fd < 0) { + int old_errno = errno; + + fd = open(etc_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) { + if (old_errno == EROFS && errno == ENOENT) + return log_error_errno(errno, + "System cannot boot: Missing /etc/machine-id and /etc is mounted read-only.\n" + "Booting up is supported only when:\n" + "1) /etc/machine-id exists and is populated.\n" + "2) /etc/machine-id exists and is empty.\n" + "3) /etc/machine-id is missing and /etc is writable.\n"); + else + return log_error_errno(errno, "Cannot open %s: %m", etc_machine_id); + } + + writable = false; + } else + writable = true; + } + + /* A we got a valid machine ID argument, that's what counts */ + if (sd_id128_is_null(machine_id)) { + + /* Try to read any existing machine ID */ + if (id128_read_fd(fd, ID128_PLAIN, ret) >= 0) + return 0; + + /* Hmm, so, the id currently stored is not useful, then let's generate one */ + r = generate_machine_id(root, &machine_id); + if (r < 0) + return r; + } + + if (writable) { + if (lseek(fd, 0, SEEK_SET) == (off_t) -1) + return log_error_errno(errno, "Failed to seek %s: %m", etc_machine_id); + + if (ftruncate(fd, 0) < 0) + return log_error_errno(errno, "Failed to truncate %s: %m", etc_machine_id); + + if (id128_write_fd(fd, ID128_PLAIN, machine_id, true) >= 0) + goto finish; + } + + fd = safe_close(fd); + + /* Hmm, we couldn't write it? So let's write it to /run/machine-id as a replacement */ + + run_machine_id = prefix_roota(root, "/run/machine-id"); + + RUN_WITH_UMASK(0022) + r = id128_write(run_machine_id, ID128_PLAIN, machine_id, false); + if (r < 0) { + (void) unlink(run_machine_id); + return log_error_errno(r, "Cannot write %s: %m", run_machine_id); + } + + /* And now, let's mount it over */ + if (mount(run_machine_id, etc_machine_id, NULL, MS_BIND, NULL) < 0) { + (void) unlink_noerrno(run_machine_id); + return log_error_errno(errno, "Failed to mount %s: %m", etc_machine_id); + } + + log_info("Installed transient %s file.", etc_machine_id); + + /* Mark the mount read-only */ + if (mount(NULL, etc_machine_id, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL) < 0) + log_warning_errno(errno, "Failed to make transient %s read-only, ignoring: %m", etc_machine_id); + +finish: + if (ret) + *ret = machine_id; + + return 0; +} + +int machine_id_commit(const char *root) { + _cleanup_close_ int fd = -1, initial_mntns_fd = -1; + const char *etc_machine_id; + sd_id128_t id; + int r; + + /* Replaces a tmpfs bind mount of /etc/machine-id by a proper file, atomically. For this, the umount is removed + * in a mount namespace, a new file is created at the right place. Afterwards the mount is also removed in the + * original mount namespace, thus revealing the file that was just created. */ + + etc_machine_id = prefix_roota(root, "/etc/machine-id"); + + r = path_is_mount_point(etc_machine_id, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to determine whether %s is a mount point: %m", etc_machine_id); + if (r == 0) { + log_debug("%s is not a mount point. Nothing to do.", etc_machine_id); + return 0; + } + + /* Read existing machine-id */ + fd = open(etc_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_error_errno(errno, "Cannot open %s: %m", etc_machine_id); + + r = fd_is_temporary_fs(fd); + if (r < 0) + return log_error_errno(r, "Failed to determine whether %s is on a temporary file system: %m", etc_machine_id); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EROFS), + "%s is not on a temporary file system.", + etc_machine_id); + + r = id128_read_fd(fd, ID128_PLAIN, &id); + if (r < 0) + return log_error_errno(r, "We didn't find a valid machine ID in %s: %m", etc_machine_id); + + fd = safe_close(fd); + + /* Store current mount namespace */ + r = namespace_open(0, NULL, &initial_mntns_fd, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Can't fetch current mount namespace: %m"); + + /* Switch to a new mount namespace, isolate ourself and unmount etc_machine_id in our new namespace */ + if (unshare(CLONE_NEWNS) < 0) + return log_error_errno(errno, "Failed to enter new namespace: %m"); + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) + return log_error_errno(errno, "Couldn't make-rslave / mountpoint in our private namespace: %m"); + + if (umount(etc_machine_id) < 0) + return log_error_errno(errno, "Failed to unmount transient %s file in our private namespace: %m", etc_machine_id); + + /* Update a persistent version of etc_machine_id */ + r = id128_write(etc_machine_id, ID128_PLAIN, id, true); + if (r < 0) + return log_error_errno(r, "Cannot write %s. This is mandatory to get a persistent machine ID: %m", etc_machine_id); + + /* Return to initial namespace and proceed a lazy tmpfs unmount */ + r = namespace_enter(-1, initial_mntns_fd, -1, -1, -1); + if (r < 0) + return log_warning_errno(r, "Failed to switch back to initial mount namespace: %m.\nWe'll keep transient %s file until next reboot.", etc_machine_id); + + if (umount2(etc_machine_id, MNT_DETACH) < 0) + return log_warning_errno(errno, "Failed to unmount transient %s file: %m.\nWe keep that mount until next reboot.", etc_machine_id); + + return 0; +} diff --git a/src/core/machine-id-setup.h b/src/core/machine-id-setup.h new file mode 100644 index 0000000..d6ac62a --- /dev/null +++ b/src/core/machine-id-setup.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +int machine_id_commit(const char *root); +int machine_id_setup(const char *root, sd_id128_t requested, sd_id128_t *ret); diff --git a/src/core/macros.systemd.in b/src/core/macros.systemd.in new file mode 100644 index 0000000..d9e7f57 --- /dev/null +++ b/src/core/macros.systemd.in @@ -0,0 +1,145 @@ +# -*- Mode: rpm-spec; indent-tabs-mode: nil -*- */ +# SPDX-License-Identifier: LGPL-2.1+ +# +# This file is part of systemd. + +# RPM macros for packages installing systemd unit files + +%_unitdir @systemunitdir@ +%_userunitdir @userunitdir@ +%_presetdir @systempresetdir@ +%_userpresetdir @userpresetdir@ +%_udevhwdbdir @udevhwdbdir@ +%_udevrulesdir @udevrulesdir@ +%_journalcatalogdir @catalogdir@ +%_binfmtdir @binfmtdir@ +%_sysctldir @sysctldir@ +%_sysusersdir @sysusersdir@ +%_tmpfilesdir @tmpfilesdir@ +%_environmentdir @environmentdir@ +%_modulesloaddir @modulesloaddir@ +%_modprobedir @modprobedir@ +%_systemdgeneratordir @systemgeneratordir@ +%_systemdusergeneratordir @usergeneratordir@ +%_systemd_system_env_generator_dir @systemenvgeneratordir@ +%_systemd_user_env_generator_dir @userenvgeneratordir@ + +# Because we had one release with a typo... +# This is temporary (Remove after systemd 240 is released) +%_environmnentdir %{warn:Use %%_environmentdir instead}%_environmentdir + +%systemd_requires \ +Requires(post): systemd \ +Requires(preun): systemd \ +Requires(postun): systemd \ +%{nil} + +%systemd_ordering \ +OrderWithRequires(post): systemd \ +OrderWithRequires(preun): systemd \ +OrderWithRequires(postun): systemd \ +%{nil} + +%systemd_post() \ +if [ $1 -eq 1 ] ; then \ + # Initial installation \ + systemctl --no-reload preset %{?*} >/dev/null 2>&1 || : \ +fi \ +%{nil} + +%systemd_user_post() %{expand:%systemd_post \\--global %%{?*}} + +%systemd_preun() \ +if [ $1 -eq 0 ] ; then \ + # Package removal, not upgrade \ + systemctl --no-reload disable --now %{?*} >/dev/null 2>&1 || : \ +fi \ +%{nil} + +%systemd_user_preun() \ +if [ $1 -eq 0 ] ; then \ + # Package removal, not upgrade \ + systemctl --global disable %{?*} >/dev/null 2>&1 || : \ +fi \ +%{nil} + +%systemd_postun() %{nil} + +%systemd_user_postun() %{nil} + +%systemd_postun_with_restart() \ +if [ $1 -ge 1 ] ; then \ + # Package upgrade, not uninstall \ + systemctl try-restart %{?*} >/dev/null 2>&1 || : \ +fi \ +%{nil} + +%systemd_user_postun_with_restart() %{nil} + +%udev_hwdb_update() %{nil} + +%udev_rules_update() %{nil} + +%journal_catalog_update() %{nil} + +# Deprecated. Use %tmpfiles_create_package instead +%tmpfiles_create() \ +systemd-tmpfiles --create %{?*} >/dev/null 2>&1 || : \ +%{nil} + +# Deprecated. Use %sysusers_create_package instead +%sysusers_create() \ +systemd-sysusers %{?*} >/dev/null 2>&1 || : \ +%{nil} + +%sysusers_create_inline() \ +systemd-sysusers - <<SYSTEMD_INLINE_EOF >/dev/null 2>&1 || : \ +%{?*} \ +SYSTEMD_INLINE_EOF \ +%{nil} + +# This should be used by package installation scripts which require users or +# groups to be present before the files installed by the package are present on +# disk (for example because some files are owned by those users or groups). +# +# Example: +# Source1: %{name}-sysusers.conf +# ... +# %install +# install -D %SOURCE1 %{buildroot}%{_sysusersdir}/%{name}.conf +# %pre +# %sysusers_create_package %{name} %SOURCE1 +# %files +# %{_sysusersdir}/%{name}.conf +%sysusers_create_package() \ +systemd-sysusers --replace=%_sysusersdir/%1.conf - <<SYSTEMD_INLINE_EOF >/dev/null 2>&1 || : \ +%(cat %2) \ +SYSTEMD_INLINE_EOF \ +%{nil} + +# This may be used by package installation scripts to create files according to +# their tmpfiles configuration from a package installation script, even before +# the files of that package are installed on disk. +# +# Example: +# Source1: %{name}-tmpfiles.conf +# ... +# %install +# install -D %SOURCE1 %{buildroot}%{_tmpfilesdir}/%{name}.conf +# %pre +# %tmpfiles_create_package %{name} %SOURCE1 +# %files +# %{_tmpfilesdir}/%{name}.conf +%tmpfiles_create_package() \ +systemd-tmpfiles --replace=%_tmpfilesdir/%1.conf --create - <<SYSTEMD_INLINE_EOF >/dev/null 2>&1 || : \ +%(cat %2) \ +SYSTEMD_INLINE_EOF \ +%{nil} + +%sysctl_apply() \ +@rootlibexecdir@/systemd-sysctl %{?*} >/dev/null 2>&1 || : \ +%{nil} + +%binfmt_apply() \ +@rootlibexecdir@/systemd-binfmt %{?*} >/dev/null 2>&1 || : \ +%{nil} diff --git a/src/core/main.c b/src/core/main.c new file mode 100644 index 0000000..561f956 --- /dev/null +++ b/src/core/main.c @@ -0,0 +1,2690 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <signal.h> +#include <stdio.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/prctl.h> +#include <sys/reboot.h> +#include <sys/stat.h> +#include <unistd.h> +#if HAVE_SECCOMP +#include <seccomp.h> +#endif +#if HAVE_VALGRIND_VALGRIND_H +#include <valgrind/valgrind.h> +#endif + +#include "sd-bus.h" +#include "sd-daemon.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "architecture.h" +#include "build.h" +#include "bus-error.h" +#include "bus-util.h" +#include "capability-util.h" +#include "cgroup-util.h" +#include "clock-util.h" +#include "conf-parser.h" +#include "cpu-set-util.h" +#include "dbus.h" +#include "dbus-manager.h" +#include "def.h" +#include "emergency-action.h" +#include "env-util.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fdset.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hostname-setup.h" +#include "ima-setup.h" +#include "killall.h" +#include "kmod-setup.h" +#include "load-fragment.h" +#include "log.h" +#include "loopback-setup.h" +#include "machine-id-setup.h" +#include "manager.h" +#include "missing.h" +#include "mount-setup.h" +#include "os-util.h" +#include "pager.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "raw-clone.h" +#include "rlimit-util.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif +#include "selinux-setup.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "smack-setup.h" +#include "special.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "switch-root.h" +#include "sysctl-util.h" +#include "terminal-util.h" +#include "umask-util.h" +#include "user-util.h" +#include "util.h" +#include "virt.h" +#include "watchdog.h" + +#if HAS_FEATURE_ADDRESS_SANITIZER +#include <sanitizer/lsan_interface.h> +#endif + +static enum { + ACTION_RUN, + ACTION_HELP, + ACTION_VERSION, + ACTION_TEST, + ACTION_DUMP_CONFIGURATION_ITEMS, + ACTION_DUMP_BUS_PROPERTIES, +} arg_action = ACTION_RUN; +static char *arg_default_unit = NULL; +static bool arg_system = false; +static bool arg_dump_core = true; +static int arg_crash_chvt = -1; +static bool arg_crash_shell = false; +static bool arg_crash_reboot = false; +static char *arg_confirm_spawn = NULL; +static ShowStatus arg_show_status = _SHOW_STATUS_INVALID; +static bool arg_switched_root = false; +static PagerFlags arg_pager_flags = 0; +static bool arg_service_watchdogs = true; +static ExecOutput arg_default_std_output = EXEC_OUTPUT_JOURNAL; +static ExecOutput arg_default_std_error = EXEC_OUTPUT_INHERIT; +static usec_t arg_default_restart_usec = DEFAULT_RESTART_USEC; +static usec_t arg_default_timeout_start_usec = DEFAULT_TIMEOUT_USEC; +static usec_t arg_default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC; +static usec_t arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL; +static unsigned arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST; +static usec_t arg_runtime_watchdog = 0; +static usec_t arg_shutdown_watchdog = 10 * USEC_PER_MINUTE; +static char *arg_early_core_pattern = NULL; +static char *arg_watchdog_device = NULL; +static char **arg_default_environment = NULL; +static struct rlimit *arg_default_rlimit[_RLIMIT_MAX] = {}; +static uint64_t arg_capability_bounding_set = CAP_ALL; +static bool arg_no_new_privs = false; +static nsec_t arg_timer_slack_nsec = NSEC_INFINITY; +static usec_t arg_default_timer_accuracy_usec = 1 * USEC_PER_MINUTE; +static Set* arg_syscall_archs = NULL; +static FILE* arg_serialization = NULL; +static int arg_default_cpu_accounting = -1; +static bool arg_default_io_accounting = false; +static bool arg_default_ip_accounting = false; +static bool arg_default_blockio_accounting = false; +static bool arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT; +static bool arg_default_tasks_accounting = true; +static uint64_t arg_default_tasks_max = UINT64_MAX; +static sd_id128_t arg_machine_id = {}; +static EmergencyAction arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE; + +_noreturn_ static void freeze_or_exit_or_reboot(void) { + + /* If we are running in a contianer, let's prefer exiting, after all we can propagate an exit code to the + * container manager, and thus inform it that something went wrong. */ + if (detect_container() > 0) { + log_emergency("Exiting PID 1..."); + exit(EXIT_EXCEPTION); + } + + if (arg_crash_reboot) { + log_notice("Rebooting in 10s..."); + (void) sleep(10); + + log_notice("Rebooting now..."); + (void) reboot(RB_AUTOBOOT); + log_emergency_errno(errno, "Failed to reboot: %m"); + } + + log_emergency("Freezing execution."); + freeze(); +} + +_noreturn_ static void crash(int sig) { + struct sigaction sa; + pid_t pid; + + if (getpid_cached() != 1) + /* Pass this on immediately, if this is not PID 1 */ + (void) raise(sig); + else if (!arg_dump_core) + log_emergency("Caught <%s>, not dumping core.", signal_to_string(sig)); + else { + sa = (struct sigaction) { + .sa_handler = nop_signal_handler, + .sa_flags = SA_NOCLDSTOP|SA_RESTART, + }; + + /* We want to wait for the core process, hence let's enable SIGCHLD */ + (void) sigaction(SIGCHLD, &sa, NULL); + + pid = raw_clone(SIGCHLD); + if (pid < 0) + log_emergency_errno(errno, "Caught <%s>, cannot fork for core dump: %m", signal_to_string(sig)); + else if (pid == 0) { + /* Enable default signal handler for core dump */ + + sa = (struct sigaction) { + .sa_handler = SIG_DFL, + }; + (void) sigaction(sig, &sa, NULL); + + /* Don't limit the coredump size */ + (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)); + + /* Just to be sure... */ + (void) chdir("/"); + + /* Raise the signal again */ + pid = raw_getpid(); + (void) kill(pid, sig); /* raise() would kill the parent */ + + assert_not_reached("We shouldn't be here..."); + _exit(EXIT_EXCEPTION); + } else { + siginfo_t status; + int r; + + /* Order things nicely. */ + r = wait_for_terminate(pid, &status); + if (r < 0) + log_emergency_errno(r, "Caught <%s>, waitpid() failed: %m", signal_to_string(sig)); + else if (status.si_code != CLD_DUMPED) + log_emergency("Caught <%s>, core dump failed (child "PID_FMT", code=%s, status=%i/%s).", + signal_to_string(sig), + pid, sigchld_code_to_string(status.si_code), + status.si_status, + strna(status.si_code == CLD_EXITED + ? exit_status_to_string(status.si_status, EXIT_STATUS_MINIMAL) + : signal_to_string(status.si_status))); + else + log_emergency("Caught <%s>, dumped core as pid "PID_FMT".", signal_to_string(sig), pid); + } + } + + if (arg_crash_chvt >= 0) + (void) chvt(arg_crash_chvt); + + sa = (struct sigaction) { + .sa_handler = SIG_IGN, + .sa_flags = SA_NOCLDSTOP|SA_NOCLDWAIT|SA_RESTART, + }; + + /* Let the kernel reap children for us */ + (void) sigaction(SIGCHLD, &sa, NULL); + + if (arg_crash_shell) { + log_notice("Executing crash shell in 10s..."); + (void) sleep(10); + + pid = raw_clone(SIGCHLD); + if (pid < 0) + log_emergency_errno(errno, "Failed to fork off crash shell: %m"); + else if (pid == 0) { + (void) setsid(); + (void) make_console_stdio(); + (void) rlimit_nofile_safe(); + (void) execle("/bin/sh", "/bin/sh", NULL, environ); + + log_emergency_errno(errno, "execle() failed: %m"); + _exit(EXIT_EXCEPTION); + } else { + log_info("Spawned crash shell as PID "PID_FMT".", pid); + (void) wait_for_terminate(pid, NULL); + } + } + + freeze_or_exit_or_reboot(); +} + +static void install_crash_handler(void) { + static const struct sigaction sa = { + .sa_handler = crash, + .sa_flags = SA_NODEFER, /* So that we can raise the signal again from the signal handler */ + }; + int r; + + /* We ignore the return value here, since, we don't mind if we + * cannot set up a crash handler */ + r = sigaction_many(&sa, SIGNALS_CRASH_HANDLER, -1); + if (r < 0) + log_debug_errno(r, "I had trouble setting up the crash handler, ignoring: %m"); +} + +static int console_setup(void) { + _cleanup_close_ int tty_fd = -1; + int r; + + tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (tty_fd < 0) + return log_error_errno(tty_fd, "Failed to open /dev/console: %m"); + + /* We don't want to force text mode. plymouth may be showing + * pictures already from initrd. */ + r = reset_terminal_fd(tty_fd, false); + if (r < 0) + return log_error_errno(r, "Failed to reset /dev/console: %m"); + + return 0; +} + +static int parse_crash_chvt(const char *value) { + int b; + + if (safe_atoi(value, &arg_crash_chvt) >= 0) + return 0; + + b = parse_boolean(value); + if (b < 0) + return b; + + if (b > 0) + arg_crash_chvt = 0; /* switch to where kmsg goes */ + else + arg_crash_chvt = -1; /* turn off switching */ + + return 0; +} + +static int parse_confirm_spawn(const char *value, char **console) { + char *s; + int r; + + r = value ? parse_boolean(value) : 1; + if (r == 0) { + *console = NULL; + return 0; + } + + if (r > 0) /* on with default tty */ + s = strdup("/dev/console"); + else if (is_path(value)) /* on with fully qualified path */ + s = strdup(value); + else /* on with only a tty file name, not a fully qualified path */ + s = strjoin("/dev/", value); + if (!s) + return -ENOMEM; + + *console = s; + return 0; +} + +static int set_machine_id(const char *m) { + sd_id128_t t; + assert(m); + + if (sd_id128_from_string(m, &t) < 0) + return -EINVAL; + + if (sd_id128_is_null(t)) + return -EINVAL; + + arg_machine_id = t; + return 0; +} + +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + + int r; + + assert(key); + + if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value); + else if (in_initrd() == !!startswith(key, "rd.")) { + if (free_and_strdup(&arg_default_unit, value) < 0) + return log_oom(); + } + + } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value); + else + arg_dump_core = r; + + } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (path_is_absolute(value)) + (void) parse_path_argument_and_warn(value, false, &arg_early_core_pattern); + else + log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) { + + if (!value) + arg_crash_chvt = 0; /* turn on */ + else { + r = parse_crash_chvt(value); + if (r < 0) + log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value); + } + + } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value); + else + arg_crash_shell = r; + + } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value); + else + arg_crash_reboot = r; + + } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) { + char *s; + + r = parse_confirm_spawn(value, &s); + if (r < 0) + log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value); + else + free_and_replace(arg_confirm_spawn, s); + + } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value); + else + arg_service_watchdogs = r; + + } else if (proc_cmdline_key_streq(key, "systemd.show_status")) { + + if (value) { + r = parse_show_status(value, &arg_show_status); + if (r < 0) + log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value); + } else + arg_show_status = SHOW_STATUS_YES; + + } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = exec_output_from_string(value); + if (r < 0) + log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value); + else + arg_default_std_output = r; + + } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = exec_output_from_string(value); + if (r < 0) + log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value); + else + arg_default_std_error = r; + + } else if (streq(key, "systemd.setenv")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (env_assignment_is_valid(value)) { + char **env; + + env = strv_env_set(arg_default_environment, value); + if (!env) + return log_oom(); + + arg_default_environment = env; + } else + log_warning("Environment variable name '%s' is not valid. Ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = set_machine_id(value); + if (r < 0) + log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value); + + } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_sec(value, &arg_default_timeout_start_usec); + if (r < 0) + log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value); + + if (arg_default_timeout_start_usec <= 0) + arg_default_timeout_start_usec = USEC_INFINITY; + + } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + (void) parse_path_argument_and_warn(value, false, &arg_watchdog_device); + + } else if (streq(key, "quiet") && !value) { + + if (arg_show_status == _SHOW_STATUS_INVALID) + arg_show_status = SHOW_STATUS_AUTO; + + } else if (streq(key, "debug") && !value) { + + /* Note that log_parse_environment() handles 'debug' + * too, and sets the log level to LOG_DEBUG. */ + + if (detect_container() > 0) + log_set_target(LOG_TARGET_CONSOLE); + + } else if (!value) { + const char *target; + + /* SysV compatibility */ + target = runlevel_to_target(key); + if (target) + return free_and_strdup(&arg_default_unit, target); + } + + return 0; +} + +#define DEFINE_SETTER(name, func, descr) \ + static int name(const char *unit, \ + const char *filename, \ + unsigned line, \ + const char *section, \ + unsigned section_line, \ + const char *lvalue, \ + int ltype, \ + const char *rvalue, \ + void *data, \ + void *userdata) { \ + \ + int r; \ + \ + assert(filename); \ + assert(lvalue); \ + assert(rvalue); \ + \ + r = func(rvalue); \ + if (r < 0) \ + log_syntax(unit, LOG_ERR, filename, line, r, \ + "Invalid " descr "'%s': %m", \ + rvalue); \ + \ + return 0; \ + } + +DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level"); +DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target"); +DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color" ); +DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location"); + +static int config_parse_cpu_affinity2( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_cpu_free_ cpu_set_t *c = NULL; + int ncpus; + + ncpus = parse_cpu_set_and_warn(rvalue, &c, unit, filename, line, lvalue); + if (ncpus < 0) + return ncpus; + + if (sched_setaffinity(0, CPU_ALLOC_SIZE(ncpus), c) < 0) + log_warning_errno(errno, "Failed to set CPU affinity: %m"); + + return 0; +} + +static int config_parse_show_status( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int k; + ShowStatus *b = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + k = parse_show_status(rvalue, b); + if (k < 0) { + log_syntax(unit, LOG_ERR, filename, line, k, "Failed to parse show status setting, ignoring: %s", rvalue); + return 0; + } + + return 0; +} + +static int config_parse_output_restricted( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecOutput t, *eo = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + t = exec_output_from_string(rvalue); + if (t < 0) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse output type, ignoring: %s", rvalue); + return 0; + } + + if (IN_SET(t, EXEC_OUTPUT_SOCKET, EXEC_OUTPUT_NAMED_FD, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Standard output types socket, fd:, file:, append: are not supported as defaults, ignoring: %s", rvalue); + return 0; + } + + *eo = t; + return 0; +} + +static int config_parse_crash_chvt( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_crash_chvt(rvalue); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse CrashChangeVT= setting, ignoring: %s", rvalue); + return 0; + } + + return 0; +} + +static int parse_config_file(void) { + + const ConfigTableItem items[] = { + { "Manager", "LogLevel", config_parse_level2, 0, NULL }, + { "Manager", "LogTarget", config_parse_target, 0, NULL }, + { "Manager", "LogColor", config_parse_color, 0, NULL }, + { "Manager", "LogLocation", config_parse_location, 0, NULL }, + { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core }, + { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, NULL }, + { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, NULL }, + { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell }, + { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot }, + { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status }, + { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, NULL }, + { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL }, + { "Manager", "RuntimeWatchdogSec", config_parse_sec, 0, &arg_runtime_watchdog }, + { "Manager", "ShutdownWatchdogSec", config_parse_sec, 0, &arg_shutdown_watchdog }, + { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device }, + { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set }, + { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs }, +#if HAVE_SECCOMP + { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs }, +#endif + { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec }, + { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_default_timer_accuracy_usec }, + { "Manager", "DefaultStandardOutput", config_parse_output_restricted,0, &arg_default_std_output }, + { "Manager", "DefaultStandardError", config_parse_output_restricted,0, &arg_default_std_error }, + { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_default_timeout_start_usec }, + { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_default_timeout_stop_usec }, + { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_default_restart_usec }, + { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_default_start_limit_interval }, /* obsolete alias */ + { "Manager", "DefaultStartLimitIntervalSec",config_parse_sec, 0, &arg_default_start_limit_interval }, + { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_default_start_limit_burst }, + { "Manager", "DefaultEnvironment", config_parse_environ, 0, &arg_default_environment }, + { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_default_rlimit }, + { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_default_rlimit }, + { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_default_rlimit }, + { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_default_rlimit }, + { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_default_rlimit }, + { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_default_rlimit }, + { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_default_rlimit }, + { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_default_rlimit }, + { "Manager", "DefaultLimitNPROC", config_parse_rlimit, RLIMIT_NPROC, arg_default_rlimit }, + { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_default_rlimit }, + { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_default_rlimit }, + { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_default_rlimit }, + { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_default_rlimit }, + { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_default_rlimit }, + { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_default_rlimit }, + { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_default_rlimit }, + { "Manager", "DefaultCPUAccounting", config_parse_tristate, 0, &arg_default_cpu_accounting }, + { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_default_io_accounting }, + { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_default_ip_accounting }, + { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_default_blockio_accounting }, + { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_default_memory_accounting }, + { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting }, + { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max }, + { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action }, + {} + }; + + const char *fn, *conf_dirs_nulstr; + + fn = arg_system ? + PKGSYSCONFDIR "/system.conf" : + PKGSYSCONFDIR "/user.conf"; + + conf_dirs_nulstr = arg_system ? + CONF_PATHS_NULSTR("systemd/system.conf.d") : + CONF_PATHS_NULSTR("systemd/user.conf.d"); + + (void) config_parse_many_nulstr(fn, conf_dirs_nulstr, "Manager\0", config_item_table_lookup, items, CONFIG_PARSE_WARN, NULL); + + /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we used USEC_INFINITY + * like everywhere else. */ + if (arg_default_timeout_start_usec <= 0) + arg_default_timeout_start_usec = USEC_INFINITY; + if (arg_default_timeout_stop_usec <= 0) + arg_default_timeout_stop_usec = USEC_INFINITY; + + return 0; +} + +static void set_manager_defaults(Manager *m) { + + assert(m); + + /* Propagates the various default unit property settings into the manager object, i.e. properties that do not + * affect the manager itself, but are just what newly allocated units will have set if they haven't set + * anything else. (Also see set_manager_settings() for the settings that affect the manager's own behaviour) */ + + m->default_timer_accuracy_usec = arg_default_timer_accuracy_usec; + m->default_std_output = arg_default_std_output; + m->default_std_error = arg_default_std_error; + m->default_timeout_start_usec = arg_default_timeout_start_usec; + m->default_timeout_stop_usec = arg_default_timeout_stop_usec; + m->default_restart_usec = arg_default_restart_usec; + m->default_start_limit_interval = arg_default_start_limit_interval; + m->default_start_limit_burst = arg_default_start_limit_burst; + + /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU + * controller to be enabled, so the default is to enable it unless we got told otherwise. */ + if (arg_default_cpu_accounting >= 0) + m->default_cpu_accounting = arg_default_cpu_accounting; + else + m->default_cpu_accounting = cpu_accounting_is_cheap(); + + m->default_io_accounting = arg_default_io_accounting; + m->default_ip_accounting = arg_default_ip_accounting; + m->default_blockio_accounting = arg_default_blockio_accounting; + m->default_memory_accounting = arg_default_memory_accounting; + m->default_tasks_accounting = arg_default_tasks_accounting; + m->default_tasks_max = arg_default_tasks_max; + + (void) manager_set_default_rlimits(m, arg_default_rlimit); + + (void) manager_default_environment(m); + (void) manager_transient_environment_add(m, arg_default_environment); +} + +static void set_manager_settings(Manager *m) { + + assert(m); + + /* Propagates the various manager settings into the manager object, i.e. properties that effect the manager + * itself (as opposed to just being inherited into newly allocated units, see set_manager_defaults() above). */ + + m->confirm_spawn = arg_confirm_spawn; + m->service_watchdogs = arg_service_watchdogs; + m->runtime_watchdog = arg_runtime_watchdog; + m->shutdown_watchdog = arg_shutdown_watchdog; + m->cad_burst_action = arg_cad_burst_action; + + manager_set_show_status(m, arg_show_status); +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_LOG_LEVEL = 0x100, + ARG_LOG_TARGET, + ARG_LOG_COLOR, + ARG_LOG_LOCATION, + ARG_UNIT, + ARG_SYSTEM, + ARG_USER, + ARG_TEST, + ARG_NO_PAGER, + ARG_VERSION, + ARG_DUMP_CONFIGURATION_ITEMS, + ARG_DUMP_BUS_PROPERTIES, + ARG_DUMP_CORE, + ARG_CRASH_CHVT, + ARG_CRASH_SHELL, + ARG_CRASH_REBOOT, + ARG_CONFIRM_SPAWN, + ARG_SHOW_STATUS, + ARG_DESERIALIZE, + ARG_SWITCHED_ROOT, + ARG_DEFAULT_STD_OUTPUT, + ARG_DEFAULT_STD_ERROR, + ARG_MACHINE_ID, + ARG_SERVICE_WATCHDOGS, + }; + + static const struct option options[] = { + { "log-level", required_argument, NULL, ARG_LOG_LEVEL }, + { "log-target", required_argument, NULL, ARG_LOG_TARGET }, + { "log-color", optional_argument, NULL, ARG_LOG_COLOR }, + { "log-location", optional_argument, NULL, ARG_LOG_LOCATION }, + { "unit", required_argument, NULL, ARG_UNIT }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "user", no_argument, NULL, ARG_USER }, + { "test", no_argument, NULL, ARG_TEST }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "dump-configuration-items", no_argument, NULL, ARG_DUMP_CONFIGURATION_ITEMS }, + { "dump-bus-properties", no_argument, NULL, ARG_DUMP_BUS_PROPERTIES }, + { "dump-core", optional_argument, NULL, ARG_DUMP_CORE }, + { "crash-chvt", required_argument, NULL, ARG_CRASH_CHVT }, + { "crash-shell", optional_argument, NULL, ARG_CRASH_SHELL }, + { "crash-reboot", optional_argument, NULL, ARG_CRASH_REBOOT }, + { "confirm-spawn", optional_argument, NULL, ARG_CONFIRM_SPAWN }, + { "show-status", optional_argument, NULL, ARG_SHOW_STATUS }, + { "deserialize", required_argument, NULL, ARG_DESERIALIZE }, + { "switched-root", no_argument, NULL, ARG_SWITCHED_ROOT }, + { "default-standard-output", required_argument, NULL, ARG_DEFAULT_STD_OUTPUT, }, + { "default-standard-error", required_argument, NULL, ARG_DEFAULT_STD_ERROR, }, + { "machine-id", required_argument, NULL, ARG_MACHINE_ID }, + { "service-watchdogs", required_argument, NULL, ARG_SERVICE_WATCHDOGS }, + {} + }; + + int c, r; + + assert(argc >= 1); + assert(argv); + + if (getpid_cached() == 1) + opterr = 0; + + while ((c = getopt_long(argc, argv, "hDbsz:", options, NULL)) >= 0) + + switch (c) { + + case ARG_LOG_LEVEL: + r = log_set_max_level_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg); + + break; + + case ARG_LOG_TARGET: + r = log_set_target_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg); + + break; + + case ARG_LOG_COLOR: + + if (optarg) { + r = log_show_color_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log color setting \"%s\": %m", + optarg); + } else + log_show_color(true); + + break; + + case ARG_LOG_LOCATION: + if (optarg) { + r = log_show_location_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log location setting \"%s\": %m", + optarg); + } else + log_show_location(true); + + break; + + case ARG_DEFAULT_STD_OUTPUT: + r = exec_output_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m", + optarg); + arg_default_std_output = r; + break; + + case ARG_DEFAULT_STD_ERROR: + r = exec_output_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m", + optarg); + arg_default_std_error = r; + break; + + case ARG_UNIT: + r = free_and_strdup(&arg_default_unit, optarg); + if (r < 0) + return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg); + + break; + + case ARG_SYSTEM: + arg_system = true; + break; + + case ARG_USER: + arg_system = false; + break; + + case ARG_TEST: + arg_action = ACTION_TEST; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_VERSION: + arg_action = ACTION_VERSION; + break; + + case ARG_DUMP_CONFIGURATION_ITEMS: + arg_action = ACTION_DUMP_CONFIGURATION_ITEMS; + break; + + case ARG_DUMP_BUS_PROPERTIES: + arg_action = ACTION_DUMP_BUS_PROPERTIES; + break; + + case ARG_DUMP_CORE: + if (!optarg) + arg_dump_core = true; + else { + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse dump core boolean: \"%s\": %m", + optarg); + arg_dump_core = r; + } + break; + + case ARG_CRASH_CHVT: + r = parse_crash_chvt(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m", + optarg); + break; + + case ARG_CRASH_SHELL: + if (!optarg) + arg_crash_shell = true; + else { + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse crash shell boolean: \"%s\": %m", + optarg); + arg_crash_shell = r; + } + break; + + case ARG_CRASH_REBOOT: + if (!optarg) + arg_crash_reboot = true; + else { + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse crash shell boolean: \"%s\": %m", + optarg); + arg_crash_reboot = r; + } + break; + + case ARG_CONFIRM_SPAWN: + arg_confirm_spawn = mfree(arg_confirm_spawn); + + r = parse_confirm_spawn(optarg, &arg_confirm_spawn); + if (r < 0) + return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m", + optarg); + break; + + case ARG_SERVICE_WATCHDOGS: + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse service watchdogs boolean: \"%s\": %m", + optarg); + arg_service_watchdogs = r; + break; + + case ARG_SHOW_STATUS: + if (optarg) { + r = parse_show_status(optarg, &arg_show_status); + if (r < 0) + return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m", + optarg); + } else + arg_show_status = SHOW_STATUS_YES; + break; + + case ARG_DESERIALIZE: { + int fd; + FILE *f; + + r = safe_atoi(optarg, &fd); + if (r < 0) + log_error_errno(r, "Failed to parse deserialize option \"%s\": %m", optarg); + if (fd < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid deserialize fd: %d", + fd); + + (void) fd_cloexec(fd, true); + + f = fdopen(fd, "r"); + if (!f) + return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd); + + safe_fclose(arg_serialization); + arg_serialization = f; + + break; + } + + case ARG_SWITCHED_ROOT: + arg_switched_root = true; + break; + + case ARG_MACHINE_ID: + r = set_machine_id(optarg); + if (r < 0) + return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg); + break; + + case 'h': + arg_action = ACTION_HELP; + break; + + case 'D': + log_set_max_level(LOG_DEBUG); + break; + + case 'b': + case 's': + case 'z': + /* Just to eat away the sysvinit kernel + * cmdline args without getopt() error + * messages that we'll parse in + * parse_proc_cmdline_word() or ignore. */ + + case '?': + if (getpid_cached() != 1) + return -EINVAL; + else + return 0; + + default: + assert_not_reached("Unhandled option code."); + } + + if (optind < argc && getpid_cached() != 1) { + /* Hmm, when we aren't run as init system + * let's complain about excess arguments */ + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Excess arguments."); + } + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "Starts up and maintains the system or user services.\n\n" + " -h --help Show this help\n" + " --version Show version\n" + " --test Determine startup sequence, dump it and exit\n" + " --no-pager Do not pipe output into a pager\n" + " --dump-configuration-items Dump understood unit configuration items\n" + " --dump-bus-properties Dump exposed bus properties\n" + " --unit=UNIT Set default unit\n" + " --system Run a system instance, even if PID != 1\n" + " --user Run a user instance\n" + " --dump-core[=BOOL] Dump core on crash\n" + " --crash-vt=NR Change to specified VT on crash\n" + " --crash-reboot[=BOOL] Reboot on crash\n" + " --crash-shell[=BOOL] Run shell on crash\n" + " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n" + " --show-status[=BOOL] Show status updates on the console during bootup\n" + " --log-target=TARGET Set log target (console, journal, kmsg, journal-or-kmsg, null)\n" + " --log-level=LEVEL Set log level (debug, info, notice, warning, err, crit, alert, emerg)\n" + " --log-color[=BOOL] Highlight important log messages\n" + " --log-location[=BOOL] Include code location in log messages\n" + " --default-standard-output= Set default standard output for services\n" + " --default-standard-error= Set default standard error output for services\n" + "\nSee the %s for details.\n" + , program_invocation_short_name + , link + ); + + return 0; +} + +static int prepare_reexecute( + Manager *m, + FILE **ret_f, + FDSet **ret_fds, + bool switching_root) { + + _cleanup_fdset_free_ FDSet *fds = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(m); + assert(ret_f); + assert(ret_fds); + + r = manager_open_serialization(m, &f); + if (r < 0) + return log_error_errno(r, "Failed to create serialization file: %m"); + + /* Make sure nothing is really destructed when we shut down */ + m->n_reloading++; + bus_manager_send_reloading(m, true); + + fds = fdset_new(); + if (!fds) + return log_oom(); + + r = manager_serialize(m, f, fds, switching_root); + if (r < 0) + return r; + + if (fseeko(f, 0, SEEK_SET) == (off_t) -1) + return log_error_errno(errno, "Failed to rewind serialization fd: %m"); + + r = fd_cloexec(fileno(f), false); + if (r < 0) + return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m"); + + r = fdset_cloexec(fds, false); + if (r < 0) + return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m"); + + *ret_f = TAKE_PTR(f); + *ret_fds = TAKE_PTR(fds); + + return 0; +} + +static void bump_file_max_and_nr_open(void) { + + /* Let's bump fs.file-max and fs.nr_open to their respective maximums. On current kernels large numbers of file + * descriptors are no longer a performance problem and their memory is properly tracked by memcg, thus counting + * them and limiting them in another two layers of limits is unnecessary and just complicates things. This + * function hence turns off 2 of the 4 levels of limits on file descriptors, and makes RLIMIT_NOLIMIT (soft + + * hard) the only ones that really matter. */ + +#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN + _cleanup_free_ char *t = NULL; + int r; +#endif + +#if BUMP_PROC_SYS_FS_FILE_MAX + /* I so wanted to use STRINGIFY(ULONG_MAX) here, but alas we can't as glibc/gcc define that as + * "(0x7fffffffffffffffL * 2UL + 1UL)". Seriously. 😢 */ + if (asprintf(&t, "%lu\n", ULONG_MAX) < 0) { + log_oom(); + return; + } + + r = sysctl_write("fs/file-max", t); + if (r < 0) + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.file-max, ignoring: %m"); +#endif + +#if BUMP_PROC_SYS_FS_FILE_MAX && BUMP_PROC_SYS_FS_NR_OPEN + t = mfree(t); +#endif + +#if BUMP_PROC_SYS_FS_NR_OPEN + int v = INT_MAX; + + /* Arg! The kernel enforces maximum and minimum values on the fs.nr_open, but we don't really know what they + * are. The expression by which the maximum is determined is dependent on the architecture, and is something we + * don't really want to copy to userspace, as it is dependent on implementation details of the kernel. Since + * the kernel doesn't expose the maximum value to us, we can only try and hope. Hence, let's start with + * INT_MAX, and then keep halving the value until we find one that works. Ugly? Yes, absolutely, but kernel + * APIs are kernel APIs, so what do can we do... 🤯 */ + + for (;;) { + int k; + + v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */ + if (v < 1024) { + log_warning("Can't bump fs.nr_open, value too small."); + break; + } + + k = read_nr_open(); + if (k < 0) { + log_error_errno(k, "Failed to read fs.nr_open: %m"); + break; + } + if (k >= v) { /* Already larger */ + log_debug("Skipping bump, value is already larger."); + break; + } + + if (asprintf(&t, "%i\n", v) < 0) { + log_oom(); + return; + } + + r = sysctl_write("fs/nr_open", t); + t = mfree(t); + if (r == -EINVAL) { + log_debug("Couldn't write fs.nr_open as %i, halving it.", v); + v /= 2; + continue; + } + if (r < 0) { + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m"); + break; + } + + log_debug("Successfully bumped fs.nr_open to %i", v); + break; + } +#endif +} + +static int bump_rlimit_nofile(struct rlimit *saved_rlimit) { + struct rlimit new_rlimit; + int r, nr; + + assert(saved_rlimit); + + /* Save the original RLIMIT_NOFILE so that we can reset it later when transitioning from the initrd to the main + * systemd or suchlike. */ + if (getrlimit(RLIMIT_NOFILE, saved_rlimit) < 0) + return log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m"); + + /* Get the underlying absolute limit the kernel enforces */ + nr = read_nr_open(); + + /* Make sure forked processes get limits based on the original kernel setting */ + if (!arg_default_rlimit[RLIMIT_NOFILE]) { + struct rlimit *rl; + + rl = newdup(struct rlimit, saved_rlimit, 1); + if (!rl) + return log_oom(); + + /* Bump the hard limit for system services to a substantially higher value. The default hard limit + * current kernels set is pretty low (4K), mostly for historical reasons. According to kernel + * developers, the fd handling in recent kernels has been optimized substantially enough, so that we + * can bump the limit now, without paying too high a price in memory or performance. Note however that + * we only bump the hard limit, not the soft limit. That's because select() works the way it works, and + * chokes on fds >= 1024. If we'd bump the soft limit globally, it might accidentally happen to + * unexpecting programs that they get fds higher than what they can process using select(). By only + * bumping the hard limit but leaving the low limit as it is we avoid this pitfall: programs that are + * written by folks aware of the select() problem in mind (and thus use poll()/epoll instead of + * select(), the way everybody should) can explicitly opt into high fds by bumping their soft limit + * beyond 1024, to the hard limit we pass. */ + if (arg_system) + rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE)); + + /* If for some reason we were invoked with a soft limit above 1024 (which should never + * happen!, but who knows what we get passed in from pam_limit when invoked as --user + * instance), then lower what we pass on to not confuse our children */ + rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE); + + arg_default_rlimit[RLIMIT_NOFILE] = rl; + } + + /* Calculate the new limits to use for us. Never lower from what we inherited. */ + new_rlimit = (struct rlimit) { + .rlim_cur = MAX((rlim_t) nr, saved_rlimit->rlim_cur), + .rlim_max = MAX((rlim_t) nr, saved_rlimit->rlim_max), + }; + + /* Shortcut if nothing changes. */ + if (saved_rlimit->rlim_max >= new_rlimit.rlim_max && + saved_rlimit->rlim_cur >= new_rlimit.rlim_cur) { + log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping."); + return 0; + } + + /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows, for + * both hard and soft. */ + r = setrlimit_closest(RLIMIT_NOFILE, &new_rlimit); + if (r < 0) + return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m"); + + return 0; +} + +static int bump_rlimit_memlock(struct rlimit *saved_rlimit) { + struct rlimit new_rlimit; + int r; + + assert(saved_rlimit); + + /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK which should + * normally disable such checks. We need them to implement IPAccessAllow= and IPAccessDeny=, hence let's bump + * the value high enough for our user. */ + + if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit) < 0) + return log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m"); + + /* Pass the original value down to invoked processes */ + if (!arg_default_rlimit[RLIMIT_MEMLOCK]) { + struct rlimit *rl; + + rl = newdup(struct rlimit, saved_rlimit, 1); + if (!rl) + return log_oom(); + + arg_default_rlimit[RLIMIT_MEMLOCK] = rl; + } + + /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t + * must be unsigned, hence this is a given, but let's make this clear here. */ + assert_cc(RLIM_INFINITY > 0); + + new_rlimit = (struct rlimit) { + .rlim_cur = MAX(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur), + .rlim_max = MAX(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max), + }; + + if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur && + saved_rlimit->rlim_cur >= new_rlimit.rlim_max) { + log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping."); + return 0; + } + + r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit); + if (r < 0) + return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m"); + + return 0; +} + +static void test_usr(void) { + + /* Check that /usr is not a separate fs */ + + if (dir_is_empty("/usr") <= 0) + return; + + log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. " + "Some things will probably break (sometimes even silently) in mysterious ways. " + "Consult http://freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information."); +} + +static int enforce_syscall_archs(Set *archs) { +#if HAVE_SECCOMP + int r; + + if (!is_seccomp_available()) + return 0; + + r = seccomp_restrict_archs(arg_syscall_archs); + if (r < 0) + return log_error_errno(r, "Failed to enforce system call architecture restrication: %m"); +#endif + return 0; +} + +static int status_welcome(void) { + _cleanup_free_ char *pretty_name = NULL, *ansi_color = NULL; + int r; + + if (IN_SET(arg_show_status, SHOW_STATUS_NO, SHOW_STATUS_AUTO)) + return 0; + + r = parse_os_release(NULL, + "PRETTY_NAME", &pretty_name, + "ANSI_COLOR", &ansi_color, + NULL); + if (r < 0) + log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to read os-release file, ignoring: %m"); + + if (log_get_show_color()) + return status_printf(NULL, 0, + "\nWelcome to \x1B[%sm%s\x1B[0m!\n", + isempty(ansi_color) ? "1" : ansi_color, + isempty(pretty_name) ? "Linux" : pretty_name); + else + return status_printf(NULL, 0, + "\nWelcome to %s!\n", + isempty(pretty_name) ? "Linux" : pretty_name); +} + +static int write_container_id(void) { + const char *c; + int r; + + c = getenv("container"); + if (isempty(c)) + return 0; + + RUN_WITH_UMASK(0022) + r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE); + if (r < 0) + return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m"); + + return 1; +} + +static int bump_unix_max_dgram_qlen(void) { + _cleanup_free_ char *qlen = NULL; + unsigned long v; + int r; + + /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set the value + * really really early during boot, so that it is actually applied to all our sockets, including the + * $NOTIFY_SOCKET one. */ + + r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen); + if (r < 0) + return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to read AF_UNIX datagram queue length, ignoring: %m"); + + r = safe_atolu(qlen, &v); + if (r < 0) + return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen); + + if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN) + return 0; + + r = write_string_filef("/proc/sys/net/unix/max_dgram_qlen", WRITE_STRING_FILE_DISABLE_BUFFER, "%lu", DEFAULT_UNIX_MAX_DGRAM_QLEN); + if (r < 0) + return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to bump AF_UNIX datagram queue length, ignoring: %m"); + + return 1; +} + +static int fixup_environment(void) { + _cleanup_free_ char *term = NULL; + const char *t; + int r; + + /* Only fix up the environment when we are started as PID 1 */ + if (getpid_cached() != 1) + return 0; + + /* We expect the environment to be set correctly if run inside a container. */ + if (detect_container() > 0) + return 0; + + /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the backend + * device used by the console. We try to make a better guess here since some consoles might not have support + * for color mode for example. + * + * However if TERM was configured through the kernel command line then leave it alone. */ + r = proc_cmdline_get_key("TERM", 0, &term); + if (r < 0) + return r; + + t = term ?: default_term_for_tty("/dev/console"); + + if (setenv("TERM", t, 1) < 0) + return -errno; + + return 0; +} + +static void redirect_telinit(int argc, char *argv[]) { + + /* This is compatibility support for SysV, where calling init as a user is identical to telinit. */ + +#if HAVE_SYSV_COMPAT + if (getpid_cached() == 1) + return; + + if (!strstr(program_invocation_short_name, "init")) + return; + + execv(SYSTEMCTL_BINARY_PATH, argv); + log_error_errno(errno, "Failed to exec " SYSTEMCTL_BINARY_PATH ": %m"); + exit(EXIT_FAILURE); +#endif +} + +static int become_shutdown( + const char *shutdown_verb, + int retval) { + + char log_level[DECIMAL_STR_MAX(int) + 1], + exit_code[DECIMAL_STR_MAX(uint8_t) + 1], + timeout[DECIMAL_STR_MAX(usec_t) + 1]; + + const char* command_line[13] = { + SYSTEMD_SHUTDOWN_BINARY_PATH, + shutdown_verb, + "--timeout", timeout, + "--log-level", log_level, + "--log-target", + }; + + _cleanup_strv_free_ char **env_block = NULL; + size_t pos = 7; + int r; + + assert(shutdown_verb); + assert(!command_line[pos]); + env_block = strv_copy(environ); + + xsprintf(log_level, "%d", log_get_max_level()); + xsprintf(timeout, "%" PRI_USEC "us", arg_default_timeout_stop_usec); + + switch (log_get_target()) { + + case LOG_TARGET_KMSG: + case LOG_TARGET_JOURNAL_OR_KMSG: + case LOG_TARGET_SYSLOG_OR_KMSG: + command_line[pos++] = "kmsg"; + break; + + case LOG_TARGET_NULL: + command_line[pos++] = "null"; + break; + + case LOG_TARGET_CONSOLE: + default: + command_line[pos++] = "console"; + break; + }; + + if (log_get_show_color()) + command_line[pos++] = "--log-color"; + + if (log_get_show_location()) + command_line[pos++] = "--log-location"; + + if (streq(shutdown_verb, "exit")) { + command_line[pos++] = "--exit-code"; + command_line[pos++] = exit_code; + xsprintf(exit_code, "%d", retval); + } + + assert(pos < ELEMENTSOF(command_line)); + + if (streq(shutdown_verb, "reboot") && + arg_shutdown_watchdog > 0 && + arg_shutdown_watchdog != USEC_INFINITY) { + + char *e; + + /* If we reboot let's set the shutdown + * watchdog and tell the shutdown binary to + * repeatedly ping it */ + r = watchdog_set_timeout(&arg_shutdown_watchdog); + watchdog_close(r < 0); + + /* Tell the binary how often to ping, ignore failure */ + if (asprintf(&e, "WATCHDOG_USEC="USEC_FMT, arg_shutdown_watchdog) > 0) + (void) strv_consume(&env_block, e); + + if (arg_watchdog_device && + asprintf(&e, "WATCHDOG_DEVICE=%s", arg_watchdog_device) > 0) + (void) strv_consume(&env_block, e); + } else + watchdog_close(true); + + /* Avoid the creation of new processes forked by the + * kernel; at this point, we will not listen to the + * signals anyway */ + if (detect_container() <= 0) + (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER); + + execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block); + return -errno; +} + +static void initialize_clock(void) { + int r; + + if (clock_is_localtime(NULL) > 0) { + int min; + + /* + * The very first call of settimeofday() also does a time warp in the kernel. + * + * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to take care + * of maintaining the RTC and do all adjustments. This matches the behavior of Windows, which leaves + * the RTC alone if the registry tells that the RTC runs in UTC. + */ + r = clock_set_timezone(&min); + if (r < 0) + log_error_errno(r, "Failed to apply local time delta, ignoring: %m"); + else + log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min); + + } else if (!in_initrd()) { + /* + * Do a dummy very first call to seal the kernel's time warp magic. + * + * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with LOCAL, but the + * real system could be set up that way. In such case, we need to delay the time-warp or the sealing + * until we reach the real system. + * + * Do no set the kernel's timezone. The concept of local time cannot be supported reliably, the time + * will jump or be incorrect at every daylight saving time change. All kernel local time concepts will + * be treated as UTC that way. + */ + (void) clock_reset_timewarp(); + } + + r = clock_apply_epoch(); + if (r < 0) + log_error_errno(r, "Current system time is before build time, but cannot correct: %m"); + else if (r > 0) + log_info("System time before build time, advancing clock."); +} + +static void initialize_coredump(bool skip_setup) { +#if ENABLE_COREDUMP + if (getpid_cached() != 1) + return; + + /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which honour the limit) + * will process core dumps for system services by default. */ + if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0) + log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m"); + + /* But at the same time, turn off the core_pattern logic by default, so that no + * coredumps are stored until the systemd-coredump tool is enabled via + * sysctl. However it can be changed via the kernel command line later so core + * dumps can still be generated during early startup and in initramfs. */ + if (!skip_setup) + disable_coredumps(); +#endif +} + +static void initialize_core_pattern(bool skip_setup) { + int r; + + if (skip_setup || !arg_early_core_pattern) + return; + + if (getpid_cached() != 1) + return; + + r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m", arg_early_core_pattern); +} + +static void do_reexecute( + int argc, + char *argv[], + const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock, + FDSet *fds, + const char *switch_root_dir, + const char *switch_root_init, + const char **ret_error_message) { + + unsigned i, j, args_size; + const char **args; + int r; + + assert(saved_rlimit_nofile); + assert(saved_rlimit_memlock); + assert(ret_error_message); + + /* Close and disarm the watchdog, so that the new instance can reinitialize it, but doesn't get rebooted while + * we do that */ + watchdog_close(true); + + /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass + * the kernel default to its child processes */ + if (saved_rlimit_nofile->rlim_cur != 0) + (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile); + if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY) + (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock); + + if (switch_root_dir) { + /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the + * SIGCHLD for them after deserializing. */ + broadcast_signal(SIGTERM, false, true, arg_default_timeout_stop_usec); + + /* And switch root with MS_MOVE, because we remove the old directory afterwards and detach it. */ + r = switch_root(switch_root_dir, "/mnt", true, MS_MOVE); + if (r < 0) + log_error_errno(r, "Failed to switch root, trying to continue: %m"); + } + + args_size = MAX(6, argc+1); + args = newa(const char*, args_size); + + if (!switch_root_init) { + char sfd[DECIMAL_STR_MAX(int) + 1]; + + /* First try to spawn ourselves with the right path, and with full serialization. We do this only if + * the user didn't specify an explicit init to spawn. */ + + assert(arg_serialization); + assert(fds); + + xsprintf(sfd, "%i", fileno(arg_serialization)); + + i = 0; + args[i++] = SYSTEMD_BINARY_PATH; + if (switch_root_dir) + args[i++] = "--switched-root"; + args[i++] = arg_system ? "--system" : "--user"; + args[i++] = "--deserialize"; + args[i++] = sfd; + args[i++] = NULL; + + assert(i <= args_size); + + /* + * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do this is on + * its own on exec(), but it will do it on exit(). Hence, to ensure we get a summary here, fork() off + * a child, let it exit() cleanly, so that it prints the summary, and wait() for it in the parent, + * before proceeding into the exec(). + */ + valgrind_summary_hack(); + + (void) execv(args[0], (char* const*) args); + log_debug_errno(errno, "Failed to execute our own binary, trying fallback: %m"); + } + + /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and envp[]. (Well, + * modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[], but let's hope that + * doesn't matter.) */ + + arg_serialization = safe_fclose(arg_serialization); + fds = fdset_free(fds); + + /* Reopen the console */ + (void) make_console_stdio(); + + for (j = 1, i = 1; j < (unsigned) argc; j++) + args[i++] = argv[j]; + args[i++] = NULL; + assert(i <= args_size); + + /* Reenable any blocked signals, especially important if we switch from initial ramdisk to init=... */ + (void) reset_all_signal_handlers(); + (void) reset_signal_mask(); + (void) rlimit_nofile_safe(); + + if (switch_root_init) { + args[0] = switch_root_init; + (void) execv(args[0], (char* const*) args); + log_warning_errno(errno, "Failed to execute configured init, trying fallback: %m"); + } + + args[0] = "/sbin/init"; + (void) execv(args[0], (char* const*) args); + r = -errno; + + manager_status_printf(NULL, STATUS_TYPE_EMERGENCY, + ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL, + "Failed to execute /sbin/init"); + + if (r == -ENOENT) { + log_warning("No /sbin/init, trying fallback"); + + args[0] = "/bin/sh"; + args[1] = NULL; + (void) execv(args[0], (char* const*) args); + log_error_errno(errno, "Failed to execute /bin/sh, giving up: %m"); + } else + log_warning_errno(r, "Failed to execute /sbin/init, giving up: %m"); + + *ret_error_message = "Failed to execute fallback shell"; +} + +static int invoke_main_loop( + Manager *m, + bool *ret_reexecute, + int *ret_retval, /* Return parameters relevant for shutting down */ + const char **ret_shutdown_verb, /* … */ + FDSet **ret_fds, /* Return parameters for reexecuting */ + char **ret_switch_root_dir, /* … */ + char **ret_switch_root_init, /* … */ + const char **ret_error_message) { + + int r; + + assert(m); + assert(ret_reexecute); + assert(ret_retval); + assert(ret_shutdown_verb); + assert(ret_fds); + assert(ret_switch_root_dir); + assert(ret_switch_root_init); + assert(ret_error_message); + + for (;;) { + r = manager_loop(m); + if (r < 0) { + *ret_error_message = "Failed to run main loop"; + return log_emergency_errno(r, "Failed to run main loop: %m"); + } + + switch ((ManagerObjective) r) { + + case MANAGER_RELOAD: { + LogTarget saved_log_target; + int saved_log_level; + + log_info("Reloading."); + + /* First, save any overridden log level/target, then parse the configuration file, which might + * change the log level to new settings. */ + + saved_log_level = m->log_level_overridden ? log_get_max_level() : -1; + saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID; + + r = parse_config_file(); + if (r < 0) + log_warning_errno(r, "Failed to parse config file, ignoring: %m"); + + set_manager_defaults(m); + + if (saved_log_level >= 0) + manager_override_log_level(m, saved_log_level); + if (saved_log_target >= 0) + manager_override_log_target(m, saved_log_target); + + r = manager_reload(m); + if (r < 0) + /* Reloading failed before the point of no return. Let's continue running as if nothing happened. */ + m->objective = MANAGER_OK; + + break; + } + + case MANAGER_REEXECUTE: + + r = prepare_reexecute(m, &arg_serialization, ret_fds, false); + if (r < 0) { + *ret_error_message = "Failed to prepare for reexecution"; + return r; + } + + log_notice("Reexecuting."); + + *ret_reexecute = true; + *ret_retval = EXIT_SUCCESS; + *ret_shutdown_verb = NULL; + *ret_switch_root_dir = *ret_switch_root_init = NULL; + + return 0; + + case MANAGER_SWITCH_ROOT: + if (!m->switch_root_init) { + r = prepare_reexecute(m, &arg_serialization, ret_fds, true); + if (r < 0) { + *ret_error_message = "Failed to prepare for reexecution"; + return r; + } + } else + *ret_fds = NULL; + + log_notice("Switching root."); + + *ret_reexecute = true; + *ret_retval = EXIT_SUCCESS; + *ret_shutdown_verb = NULL; + + /* Steal the switch root parameters */ + *ret_switch_root_dir = m->switch_root; + *ret_switch_root_init = m->switch_root_init; + m->switch_root = m->switch_root_init = NULL; + + return 0; + + case MANAGER_EXIT: + + if (MANAGER_IS_USER(m)) { + log_debug("Exit."); + + *ret_reexecute = false; + *ret_retval = m->return_value; + *ret_shutdown_verb = NULL; + *ret_fds = NULL; + *ret_switch_root_dir = *ret_switch_root_init = NULL; + + return 0; + } + + _fallthrough_; + case MANAGER_REBOOT: + case MANAGER_POWEROFF: + case MANAGER_HALT: + case MANAGER_KEXEC: { + static const char * const table[_MANAGER_OBJECTIVE_MAX] = { + [MANAGER_EXIT] = "exit", + [MANAGER_REBOOT] = "reboot", + [MANAGER_POWEROFF] = "poweroff", + [MANAGER_HALT] = "halt", + [MANAGER_KEXEC] = "kexec", + }; + + log_notice("Shutting down."); + + *ret_reexecute = false; + *ret_retval = m->return_value; + assert_se(*ret_shutdown_verb = table[m->objective]); + *ret_fds = NULL; + *ret_switch_root_dir = *ret_switch_root_init = NULL; + + return 0; + } + + default: + assert_not_reached("Unknown or unexpected manager objective."); + } + } +} + +static void log_execution_mode(bool *ret_first_boot) { + assert(ret_first_boot); + + if (arg_system) { + int v; + + log_info("systemd " GIT_VERSION " running in %ssystem mode. (" SYSTEMD_FEATURES ")", + arg_action == ACTION_TEST ? "test " : "" ); + + v = detect_virtualization(); + if (v > 0) + log_info("Detected virtualization %s.", virtualization_to_string(v)); + + log_info("Detected architecture %s.", architecture_to_string(uname_architecture())); + + if (in_initrd()) { + *ret_first_boot = false; + log_info("Running in initial RAM disk."); + } else { + /* Let's check whether we are in first boot, i.e. whether /etc is still unpopulated. We use + * /etc/machine-id as flag file, for this: if it exists we assume /etc is populated, if it + * doesn't it's unpopulated. This allows container managers and installers to provision a + * couple of files already. If the container manager wants to provision the machine ID itself + * it should pass $container_uuid to PID 1. */ + + *ret_first_boot = access("/etc/machine-id", F_OK) < 0; + if (*ret_first_boot) + log_info("Running with unpopulated /etc."); + } + } else { + if (DEBUG_LOGGING) { + _cleanup_free_ char *t; + + t = uid_to_name(getuid()); + log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (" SYSTEMD_FEATURES ")", + arg_action == ACTION_TEST ? " test" : "", getuid(), strna(t)); + } + + *ret_first_boot = false; + } +} + +static int initialize_runtime( + bool skip_setup, + struct rlimit *saved_rlimit_nofile, + struct rlimit *saved_rlimit_memlock, + const char **ret_error_message) { + + int r; + + assert(ret_error_message); + + /* Sets up various runtime parameters. Many of these initializations are conditionalized: + * + * - Some only apply to --system instances + * - Some only apply to --user instances + * - Some only apply when we first start up, but not when we reexecute + */ + + if (arg_action != ACTION_RUN) + return 0; + + if (arg_system) { + /* Make sure we leave a core dump without panicing the kernel. */ + install_crash_handler(); + + if (!skip_setup) { + r = mount_cgroup_controllers(); + if (r < 0) { + *ret_error_message = "Failed to mount cgroup hierarchies"; + return r; + } + + status_welcome(); + hostname_setup(); + machine_id_setup(NULL, arg_machine_id, NULL); + loopback_setup(); + bump_unix_max_dgram_qlen(); + bump_file_max_and_nr_open(); + test_usr(); + write_container_id(); + } + + if (arg_watchdog_device) { + r = watchdog_set_device(arg_watchdog_device); + if (r < 0) + log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device); + } + + if (arg_runtime_watchdog > 0 && arg_runtime_watchdog != USEC_INFINITY) + watchdog_set_timeout(&arg_runtime_watchdog); + } + + if (arg_timer_slack_nsec != NSEC_INFINITY) + if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0) + log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m"); + + if (arg_system && !cap_test_all(arg_capability_bounding_set)) { + r = capability_bounding_set_drop_usermode(arg_capability_bounding_set); + if (r < 0) { + *ret_error_message = "Failed to drop capability bounding set of usermode helpers"; + return log_emergency_errno(r, "Failed to drop capability bounding set of usermode helpers: %m"); + } + + r = capability_bounding_set_drop(arg_capability_bounding_set, true); + if (r < 0) { + *ret_error_message = "Failed to drop capability bounding set"; + return log_emergency_errno(r, "Failed to drop capability bounding set: %m"); + } + } + + if (arg_system && arg_no_new_privs) { + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { + *ret_error_message = "Failed to disable new privileges"; + return log_emergency_errno(errno, "Failed to disable new privileges: %m"); + } + } + + if (arg_syscall_archs) { + r = enforce_syscall_archs(arg_syscall_archs); + if (r < 0) { + *ret_error_message = "Failed to set syscall architectures"; + return r; + } + } + + if (!arg_system) + /* Become reaper of our children */ + if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) + log_warning_errno(errno, "Failed to make us a subreaper: %m"); + + /* Bump up RLIMIT_NOFILE for systemd itself */ + (void) bump_rlimit_nofile(saved_rlimit_nofile); + (void) bump_rlimit_memlock(saved_rlimit_memlock); + + return 0; +} + +static int do_queue_default_job( + Manager *m, + const char **ret_error_message) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Job *default_unit_job; + Unit *target = NULL; + int r; + + log_debug("Activating default unit: %s", arg_default_unit); + + r = manager_load_startable_unit_or_warn(m, arg_default_unit, NULL, &target); + if (r < 0) { + log_info("Falling back to rescue target: " SPECIAL_RESCUE_TARGET); + + r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target); + if (r < 0) { + *ret_error_message = r == -ERFKILL ? "Rescue target masked" + : "Failed to load rescue target"; + return r; + } + } + + assert(target->load_state == UNIT_LOADED); + + r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, &error, &default_unit_job); + if (r == -EPERM) { + log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r)); + + sd_bus_error_free(&error); + + r = manager_add_job(m, JOB_START, target, JOB_REPLACE, &error, &default_unit_job); + if (r < 0) { + *ret_error_message = "Failed to start default target"; + return log_emergency_errno(r, "Failed to start default target: %s", bus_error_message(&error, r)); + } + + } else if (r < 0) { + *ret_error_message = "Failed to isolate default target"; + return log_emergency_errno(r, "Failed to isolate default target: %s", bus_error_message(&error, r)); + } + + m->default_unit_job_id = default_unit_job->id; + + return 0; +} + +static void free_arguments(void) { + + /* Frees all arg_* variables, with the exception of arg_serialization */ + rlimit_free_all(arg_default_rlimit); + + arg_default_unit = mfree(arg_default_unit); + arg_confirm_spawn = mfree(arg_confirm_spawn); + arg_default_environment = strv_free(arg_default_environment); + arg_syscall_archs = set_free(arg_syscall_archs); +} + +static int load_configuration(int argc, char **argv, const char **ret_error_message) { + int r; + + assert(ret_error_message); + + arg_default_tasks_max = system_tasks_max_scale(DEFAULT_TASKS_MAX_PERCENTAGE, 100U); + + r = parse_config_file(); + if (r < 0) { + *ret_error_message = "Failed to parse config file"; + return r; + } + + if (arg_system) { + r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0); + if (r < 0) + log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m"); + } + + /* Note that this also parses bits from the kernel command line, including "debug". */ + log_parse_environment(); + + r = parse_argv(argc, argv); + if (r < 0) { + *ret_error_message = "Failed to parse commandline arguments"; + return r; + } + + /* Initialize default unit */ + if (!arg_default_unit) { + arg_default_unit = strdup(SPECIAL_DEFAULT_TARGET); + if (!arg_default_unit) { + *ret_error_message = "Failed to set default unit"; + return log_oom(); + } + } + + /* Initialize the show status setting if it hasn't been set explicitly yet */ + if (arg_show_status == _SHOW_STATUS_INVALID) + arg_show_status = SHOW_STATUS_YES; + + return 0; +} + +static int safety_checks(void) { + + if (getpid_cached() == 1 && + arg_action != ACTION_RUN) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Unsupported execution mode while PID 1."); + + if (getpid_cached() == 1 && + !arg_system) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Can't run --user mode as PID 1."); + + if (arg_action == ACTION_RUN && + arg_system && + getpid_cached() != 1) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Can't run system mode unless PID 1."); + + if (arg_action == ACTION_TEST && + geteuid() == 0) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Don't run test mode as root."); + + if (!arg_system && + arg_action == ACTION_RUN && + sd_booted() <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Trying to run as user instance, but the system has not been booted with systemd."); + + if (!arg_system && + arg_action == ACTION_RUN && + !getenv("XDG_RUNTIME_DIR")) + return log_error_errno(SYNTHETIC_ERRNO(EUNATCH), + "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set."); + + if (arg_system && + arg_action == ACTION_RUN && + running_in_chroot() > 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Cannot be run in a chroot() environment."); + + return 0; +} + +static int initialize_security( + bool *loaded_policy, + dual_timestamp *security_start_timestamp, + dual_timestamp *security_finish_timestamp, + const char **ret_error_message) { + + int r; + + assert(loaded_policy); + assert(security_start_timestamp); + assert(security_finish_timestamp); + assert(ret_error_message); + + dual_timestamp_get(security_start_timestamp); + + r = mac_selinux_setup(loaded_policy); + if (r < 0) { + *ret_error_message = "Failed to load SELinux policy"; + return r; + } + + r = mac_smack_setup(loaded_policy); + if (r < 0) { + *ret_error_message = "Failed to load SMACK policy"; + return r; + } + + r = ima_setup(); + if (r < 0) { + *ret_error_message = "Failed to load IMA policy"; + return r; + } + + dual_timestamp_get(security_finish_timestamp); + return 0; +} + +static void test_summary(Manager *m) { + assert(m); + + printf("-> By units:\n"); + manager_dump_units(m, stdout, "\t"); + + printf("-> By jobs:\n"); + manager_dump_jobs(m, stdout, "\t"); +} + +static int collect_fds(FDSet **ret_fds, const char **ret_error_message) { + int r; + + assert(ret_fds); + assert(ret_error_message); + + r = fdset_new_fill(ret_fds); + if (r < 0) { + *ret_error_message = "Failed to allocate fd set"; + return log_emergency_errno(r, "Failed to allocate fd set: %m"); + } + + fdset_cloexec(*ret_fds, true); + + if (arg_serialization) + assert_se(fdset_remove(*ret_fds, fileno(arg_serialization)) >= 0); + + return 0; +} + +static void setup_console_terminal(bool skip_setup) { + + if (!arg_system) + return; + + /* Become a session leader if we aren't one yet. */ + (void) setsid(); + + /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a controlling + * tty. */ + (void) release_terminal(); + + /* Reset the console, but only if this is really init and we are freshly booted */ + if (getpid_cached() == 1 && !skip_setup) + (void) console_setup(); +} + +static bool early_skip_setup_check(int argc, char *argv[]) { + bool found_deserialize = false; + int i; + + /* Determine if this is a reexecution or normal bootup. We do the full command line parsing much later, so + * let's just have a quick peek here. Note that if we have switched root, do all the special setup things + * anyway, even if in that case we also do deserialization. */ + + for (i = 1; i < argc; i++) { + if (streq(argv[i], "--switched-root")) + return false; /* If we switched root, don't skip the setup. */ + else if (streq(argv[i], "--deserialize")) + found_deserialize = true; + } + + return found_deserialize; /* When we are deserializing, then we are reexecuting, hence avoid the extensive setup */ +} + +int main(int argc, char *argv[]) { + + dual_timestamp initrd_timestamp = DUAL_TIMESTAMP_NULL, userspace_timestamp = DUAL_TIMESTAMP_NULL, kernel_timestamp = DUAL_TIMESTAMP_NULL, + security_start_timestamp = DUAL_TIMESTAMP_NULL, security_finish_timestamp = DUAL_TIMESTAMP_NULL; + struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0), + saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed + * in. Note we use different values + * for the two that indicate whether + * these fields are initialized! */ + bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false, reexecute = false; + char *switch_root_dir = NULL, *switch_root_init = NULL; + usec_t before_startup, after_startup; + static char systemd[] = "systemd"; + char timespan[FORMAT_TIMESPAN_MAX]; + const char *shutdown_verb = NULL, *error_message = NULL; + int r, retval = EXIT_FAILURE; + Manager *m = NULL; + FDSet *fds = NULL; + + /* SysV compatibility: redirect init → telinit */ + redirect_telinit(argc, argv); + + /* Take timestamps early on */ + dual_timestamp_from_monotonic(&kernel_timestamp, 0); + dual_timestamp_get(&userspace_timestamp); + + /* Figure out whether we need to do initialize the system, or if we already did that because we are + * reexecuting */ + skip_setup = early_skip_setup_check(argc, argv); + + /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent reexecution we + * are then called 'systemd'. That is confusing, hence let's call us systemd right-away. */ + program_invocation_short_name = systemd; + (void) prctl(PR_SET_NAME, systemd); + + /* Save the original command line */ + saved_argv = argv; + saved_argc = argc; + + /* Make sure that if the user says "syslog" we actually log to the journal. */ + log_set_upgrade_syslog_to_journal(true); + + if (getpid_cached() == 1) { + /* When we run as PID 1 force system mode */ + arg_system = true; + + /* Disable the umask logic */ + umask(0); + + /* Make sure that at least initially we do not ever log to journald/syslogd, because it might not be + * activated yet (even though the log socket for it exists). */ + log_set_prohibit_ipc(true); + + /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This is + * important so that we never end up logging to any foreign stderr, for example if we have to log in a + * child process right before execve()'ing the actual binary, at a point in time where socket + * activation stderr/stdout area already set up. */ + log_set_always_reopen_console(true); + + if (detect_container() <= 0) { + + /* Running outside of a container as PID 1 */ + log_set_target(LOG_TARGET_KMSG); + log_open(); + + if (in_initrd()) + initrd_timestamp = userspace_timestamp; + + if (!skip_setup) { + r = mount_setup_early(); + if (r < 0) { + error_message = "Failed to mount early API filesystems"; + goto finish; + } + + r = initialize_security( + &loaded_policy, + &security_start_timestamp, + &security_finish_timestamp, + &error_message); + if (r < 0) + goto finish; + } + + if (mac_selinux_init() < 0) { + error_message = "Failed to initialize SELinux policy"; + goto finish; + } + + if (!skip_setup) + initialize_clock(); + + /* Set the default for later on, but don't actually open the logs like this for now. Note that + * if we are transitioning from the initrd there might still be journal fd open, and we + * shouldn't attempt opening that before we parsed /proc/cmdline which might redirect output + * elsewhere. */ + log_set_target(LOG_TARGET_JOURNAL_OR_KMSG); + + } else { + /* Running inside a container, as PID 1 */ + log_set_target(LOG_TARGET_CONSOLE); + log_open(); + + /* For later on, see above... */ + log_set_target(LOG_TARGET_JOURNAL); + + /* clear the kernel timestamp, + * because we are in a container */ + kernel_timestamp = DUAL_TIMESTAMP_NULL; + } + + initialize_coredump(skip_setup); + + r = fixup_environment(); + if (r < 0) { + log_emergency_errno(r, "Failed to fix up PID 1 environment: %m"); + error_message = "Failed to fix up PID1 environment"; + goto finish; + } + + } else { + /* Running as user instance */ + arg_system = false; + log_set_target(LOG_TARGET_AUTO); + log_open(); + + /* clear the kernel timestamp, + * because we are not PID 1 */ + kernel_timestamp = DUAL_TIMESTAMP_NULL; + } + + if (arg_system) { + /* Try to figure out if we can use colors with the console. No need to do that for user instances since + * they never log into the console. */ + log_show_color(colors_enabled()); + + r = make_null_stdio(); + if (r < 0) + log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m"); + } + + /* Mount /proc, /sys and friends, so that /proc/cmdline and + * /proc/$PID/fd is available. */ + if (getpid_cached() == 1) { + + /* Load the kernel modules early. */ + if (!skip_setup) + kmod_setup(); + + r = mount_setup(loaded_policy); + if (r < 0) { + error_message = "Failed to mount API filesystems"; + goto finish; + } + } + + /* Reset all signal handlers. */ + (void) reset_all_signal_handlers(); + (void) ignore_signals(SIGNALS_IGNORE, -1); + + r = load_configuration(argc, argv, &error_message); + if (r < 0) + goto finish; + + r = safety_checks(); + if (r < 0) + goto finish; + + if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES)) + (void) pager_open(arg_pager_flags); + + if (arg_action != ACTION_RUN) + skip_setup = true; + + if (arg_action == ACTION_HELP) { + retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS; + goto finish; + } else if (arg_action == ACTION_VERSION) { + retval = version(); + goto finish; + } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) { + unit_dump_config_items(stdout); + retval = EXIT_SUCCESS; + goto finish; + } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) { + dump_bus_properties(stdout); + retval = EXIT_SUCCESS; + goto finish; + } + + assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST)); + + /* Move out of the way, so that we won't block unmounts */ + assert_se(chdir("/") == 0); + + if (arg_action == ACTION_RUN) { + + /* A core pattern might have been specified via the cmdline. */ + initialize_core_pattern(skip_setup); + + /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */ + log_close(); + + /* Remember open file descriptors for later deserialization */ + r = collect_fds(&fds, &error_message); + if (r < 0) + goto finish; + + /* Give up any control of the console, but make sure its initialized. */ + setup_console_terminal(skip_setup); + + /* Open the logging devices, if possible and necessary */ + log_open(); + } + + log_execution_mode(&first_boot); + + r = initialize_runtime(skip_setup, + &saved_rlimit_nofile, + &saved_rlimit_memlock, + &error_message); + if (r < 0) + goto finish; + + r = manager_new(arg_system ? UNIT_FILE_SYSTEM : UNIT_FILE_USER, + arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0, + &m); + if (r < 0) { + log_emergency_errno(r, "Failed to allocate manager object: %m"); + error_message = "Failed to allocate manager object"; + goto finish; + } + + m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp; + m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp; + m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp; + m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp; + m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp; + + set_manager_defaults(m); + set_manager_settings(m); + manager_set_first_boot(m, first_boot); + + /* Remember whether we should queue the default job */ + queue_default_job = !arg_serialization || arg_switched_root; + + before_startup = now(CLOCK_MONOTONIC); + + r = manager_startup(m, arg_serialization, fds); + if (r < 0) { + error_message = "Failed to start up manager"; + goto finish; + } + + /* This will close all file descriptors that were opened, but not claimed by any unit. */ + fds = fdset_free(fds); + arg_serialization = safe_fclose(arg_serialization); + + if (queue_default_job) { + r = do_queue_default_job(m, &error_message); + if (r < 0) + goto finish; + } + + after_startup = now(CLOCK_MONOTONIC); + + log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG, + "Loaded units and determined initial transaction in %s.", + format_timespan(timespan, sizeof(timespan), after_startup - before_startup, 100 * USEC_PER_MSEC)); + + if (arg_action == ACTION_TEST) { + test_summary(m); + retval = EXIT_SUCCESS; + goto finish; + } + + (void) invoke_main_loop(m, + &reexecute, + &retval, + &shutdown_verb, + &fds, + &switch_root_dir, + &switch_root_init, + &error_message); + +finish: + pager_close(); + + if (m) { + arg_shutdown_watchdog = m->shutdown_watchdog; + m = manager_free(m); + } + + free_arguments(); + mac_selinux_finish(); + + if (reexecute) + do_reexecute(argc, argv, + &saved_rlimit_nofile, + &saved_rlimit_memlock, + fds, + switch_root_dir, + switch_root_init, + &error_message); /* This only returns if reexecution failed */ + + arg_serialization = safe_fclose(arg_serialization); + fds = fdset_free(fds); + +#if HAVE_VALGRIND_VALGRIND_H + /* If we are PID 1 and running under valgrind, then let's exit + * here explicitly. valgrind will only generate nice output on + * exit(), not on exec(), hence let's do the former not the + * latter here. */ + if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) { + /* Cleanup watchdog_device strings for valgrind. We need them + * in become_shutdown() so normally we cannot free them yet. */ + watchdog_free_device(); + arg_watchdog_device = mfree(arg_watchdog_device); + return retval; + } +#endif + +#if HAS_FEATURE_ADDRESS_SANITIZER + __lsan_do_leak_check(); +#endif + + if (shutdown_verb) { + r = become_shutdown(shutdown_verb, retval); + log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting"); + error_message = "Failed to execute shutdown binary"; + } + + watchdog_free_device(); + arg_watchdog_device = mfree(arg_watchdog_device); + + if (getpid_cached() == 1) { + if (error_message) + manager_status_printf(NULL, STATUS_TYPE_EMERGENCY, + ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL, + "%s.", error_message); + freeze_or_exit_or_reboot(); + } + + return retval; +} diff --git a/src/core/manager.c b/src/core/manager.c new file mode 100644 index 0000000..6086531 --- /dev/null +++ b/src/core/manager.c @@ -0,0 +1,4678 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <fcntl.h> +#include <linux/kd.h> +#include <signal.h> +#include <stdio_ext.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/inotify.h> +#include <sys/ioctl.h> +#include <sys/reboot.h> +#include <sys/timerfd.h> +#include <sys/wait.h> +#include <unistd.h> + +#if HAVE_AUDIT +#include <libaudit.h> +#endif + +#include "sd-daemon.h" +#include "sd-messages.h" +#include "sd-path.h" + +#include "all-units.h" +#include "alloc-util.h" +#include "audit-fd.h" +#include "boot-timestamps.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-kernel.h" +#include "bus-util.h" +#include "clean-ipc.h" +#include "clock-util.h" +#include "dbus-job.h" +#include "dbus-manager.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "dirent-util.h" +#include "env-util.h" +#include "escape.h" +#include "exec-util.h" +#include "execute.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hashmap.h" +#include "io-util.h" +#include "label.h" +#include "locale-setup.h" +#include "log.h" +#include "macro.h" +#include "manager.h" +#include "missing.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "process-util.h" +#include "ratelimit.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "serialize.h" +#include "signal-util.h" +#include "socket-util.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "time-util.h" +#include "transaction.h" +#include "umask-util.h" +#include "unit-name.h" +#include "user-util.h" +#include "util.h" +#include "virt.h" +#include "watchdog.h" + +#define NOTIFY_RCVBUF_SIZE (8*1024*1024) +#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024) + +/* Initial delay and the interval for printing status messages about running jobs */ +#define JOBS_IN_PROGRESS_WAIT_USEC (5*USEC_PER_SEC) +#define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3) +#define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3 + +/* If there are more than 1K bus messages queue across our API and direct busses, then let's not add more on top until + * the queue gets more empty. */ +#define MANAGER_BUS_BUSY_THRESHOLD 1024LU + +/* How many units and jobs to process of the bus queue before returning to the event loop. */ +#define MANAGER_BUS_MESSAGE_BUDGET 100U + +static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata); +static int manager_dispatch_run_queue(sd_event_source *source, void *userdata); +static int manager_dispatch_sigchld(sd_event_source *source, void *userdata); +static int manager_dispatch_timezone_change(sd_event_source *source, const struct inotify_event *event, void *userdata); +static int manager_run_environment_generators(Manager *m); +static int manager_run_generators(Manager *m); + +static void manager_watch_jobs_in_progress(Manager *m) { + usec_t next; + int r; + + assert(m); + + /* We do not want to show the cylon animation if the user + * needs to confirm service executions otherwise confirmation + * messages will be screwed by the cylon animation. */ + if (!manager_is_confirm_spawn_disabled(m)) + return; + + if (m->jobs_in_progress_event_source) + return; + + next = now(CLOCK_MONOTONIC) + JOBS_IN_PROGRESS_WAIT_USEC; + r = sd_event_add_time( + m->event, + &m->jobs_in_progress_event_source, + CLOCK_MONOTONIC, + next, 0, + manager_dispatch_jobs_in_progress, m); + if (r < 0) + return; + + (void) sd_event_source_set_description(m->jobs_in_progress_event_source, "manager-jobs-in-progress"); +} + +#define CYLON_BUFFER_EXTRA (2*STRLEN(ANSI_RED) + STRLEN(ANSI_HIGHLIGHT_RED) + 2*STRLEN(ANSI_NORMAL)) + +static void draw_cylon(char buffer[], size_t buflen, unsigned width, unsigned pos) { + char *p = buffer; + + assert(buflen >= CYLON_BUFFER_EXTRA + width + 1); + assert(pos <= width+1); /* 0 or width+1 mean that the center light is behind the corner */ + + if (pos > 1) { + if (pos > 2) + p = mempset(p, ' ', pos-2); + if (log_get_show_color()) + p = stpcpy(p, ANSI_RED); + *p++ = '*'; + } + + if (pos > 0 && pos <= width) { + if (log_get_show_color()) + p = stpcpy(p, ANSI_HIGHLIGHT_RED); + *p++ = '*'; + } + + if (log_get_show_color()) + p = stpcpy(p, ANSI_NORMAL); + + if (pos < width) { + if (log_get_show_color()) + p = stpcpy(p, ANSI_RED); + *p++ = '*'; + if (pos < width-1) + p = mempset(p, ' ', width-1-pos); + if (log_get_show_color()) + strcpy(p, ANSI_NORMAL); + } +} + +void manager_flip_auto_status(Manager *m, bool enable) { + assert(m); + + if (enable) { + if (m->show_status == SHOW_STATUS_AUTO) + manager_set_show_status(m, SHOW_STATUS_TEMPORARY); + } else { + if (m->show_status == SHOW_STATUS_TEMPORARY) + manager_set_show_status(m, SHOW_STATUS_AUTO); + } +} + +static void manager_print_jobs_in_progress(Manager *m) { + _cleanup_free_ char *job_of_n = NULL; + Iterator i; + Job *j; + unsigned counter = 0, print_nr; + char cylon[6 + CYLON_BUFFER_EXTRA + 1]; + unsigned cylon_pos; + char time[FORMAT_TIMESPAN_MAX], limit[FORMAT_TIMESPAN_MAX] = "no limit"; + uint64_t x; + + assert(m); + assert(m->n_running_jobs > 0); + + manager_flip_auto_status(m, true); + + print_nr = (m->jobs_in_progress_iteration / JOBS_IN_PROGRESS_PERIOD_DIVISOR) % m->n_running_jobs; + + HASHMAP_FOREACH(j, m->jobs, i) + if (j->state == JOB_RUNNING && counter++ == print_nr) + break; + + /* m->n_running_jobs must be consistent with the contents of m->jobs, + * so the above loop must have succeeded in finding j. */ + assert(counter == print_nr + 1); + assert(j); + + cylon_pos = m->jobs_in_progress_iteration % 14; + if (cylon_pos >= 8) + cylon_pos = 14 - cylon_pos; + draw_cylon(cylon, sizeof(cylon), 6, cylon_pos); + + m->jobs_in_progress_iteration++; + + if (m->n_running_jobs > 1) { + if (asprintf(&job_of_n, "(%u of %u) ", counter, m->n_running_jobs) < 0) + job_of_n = NULL; + } + + format_timespan(time, sizeof(time), now(CLOCK_MONOTONIC) - j->begin_usec, 1*USEC_PER_SEC); + if (job_get_timeout(j, &x) > 0) + format_timespan(limit, sizeof(limit), x - j->begin_usec, 1*USEC_PER_SEC); + + manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon, + "%sA %s job is running for %s (%s / %s)", + strempty(job_of_n), + job_type_to_string(j->type), + unit_description(j->unit), + time, limit); +} + +static int have_ask_password(void) { + _cleanup_closedir_ DIR *dir; + struct dirent *de; + + dir = opendir("/run/systemd/ask-password"); + if (!dir) { + if (errno == ENOENT) + return false; + else + return -errno; + } + + FOREACH_DIRENT_ALL(de, dir, return -errno) { + if (startswith(de->d_name, "ask.")) + return true; + } + return false; +} + +static int manager_dispatch_ask_password_fd(sd_event_source *source, + int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + + assert(m); + + (void) flush_fd(fd); + + m->have_ask_password = have_ask_password(); + if (m->have_ask_password < 0) + /* Log error but continue. Negative have_ask_password + * is treated as unknown status. */ + log_error_errno(m->have_ask_password, "Failed to list /run/systemd/ask-password: %m"); + + return 0; +} + +static void manager_close_ask_password(Manager *m) { + assert(m); + + m->ask_password_event_source = sd_event_source_unref(m->ask_password_event_source); + m->ask_password_inotify_fd = safe_close(m->ask_password_inotify_fd); + m->have_ask_password = -EINVAL; +} + +static int manager_check_ask_password(Manager *m) { + int r; + + assert(m); + + if (!m->ask_password_event_source) { + assert(m->ask_password_inotify_fd < 0); + + mkdir_p_label("/run/systemd/ask-password", 0755); + + m->ask_password_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (m->ask_password_inotify_fd < 0) + return log_error_errno(errno, "inotify_init1() failed: %m"); + + if (inotify_add_watch(m->ask_password_inotify_fd, "/run/systemd/ask-password", IN_CREATE|IN_DELETE|IN_MOVE) < 0) { + log_error_errno(errno, "Failed to add watch on /run/systemd/ask-password: %m"); + manager_close_ask_password(m); + return -errno; + } + + r = sd_event_add_io(m->event, &m->ask_password_event_source, + m->ask_password_inotify_fd, EPOLLIN, + manager_dispatch_ask_password_fd, m); + if (r < 0) { + log_error_errno(errno, "Failed to add event source for /run/systemd/ask-password: %m"); + manager_close_ask_password(m); + return -errno; + } + + (void) sd_event_source_set_description(m->ask_password_event_source, "manager-ask-password"); + + /* Queries might have been added meanwhile... */ + manager_dispatch_ask_password_fd(m->ask_password_event_source, + m->ask_password_inotify_fd, EPOLLIN, m); + } + + return m->have_ask_password; +} + +static int manager_watch_idle_pipe(Manager *m) { + int r; + + assert(m); + + if (m->idle_pipe_event_source) + return 0; + + if (m->idle_pipe[2] < 0) + return 0; + + r = sd_event_add_io(m->event, &m->idle_pipe_event_source, m->idle_pipe[2], EPOLLIN, manager_dispatch_idle_pipe_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to watch idle pipe: %m"); + + (void) sd_event_source_set_description(m->idle_pipe_event_source, "manager-idle-pipe"); + + return 0; +} + +static void manager_close_idle_pipe(Manager *m) { + assert(m); + + m->idle_pipe_event_source = sd_event_source_unref(m->idle_pipe_event_source); + + safe_close_pair(m->idle_pipe); + safe_close_pair(m->idle_pipe + 2); +} + +static int manager_setup_time_change(Manager *m) { + int r; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + m->time_change_event_source = sd_event_source_unref(m->time_change_event_source); + m->time_change_fd = safe_close(m->time_change_fd); + + m->time_change_fd = time_change_fd(); + if (m->time_change_fd < 0) + return log_error_errno(m->time_change_fd, "Failed to create timer change timer fd: %m"); + + r = sd_event_add_io(m->event, &m->time_change_event_source, m->time_change_fd, EPOLLIN, manager_dispatch_time_change_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to create time change event source: %m"); + + /* Schedule this slightly earlier than the .timer event sources */ + r = sd_event_source_set_priority(m->time_change_event_source, SD_EVENT_PRIORITY_NORMAL-1); + if (r < 0) + return log_error_errno(r, "Failed to set priority of time change event sources: %m"); + + (void) sd_event_source_set_description(m->time_change_event_source, "manager-time-change"); + + log_debug("Set up TFD_TIMER_CANCEL_ON_SET timerfd."); + + return 0; +} + +static int manager_read_timezone_stat(Manager *m) { + struct stat st; + bool changed; + + assert(m); + + /* Read the current stat() data of /etc/localtime so that we detect changes */ + if (lstat("/etc/localtime", &st) < 0) { + log_debug_errno(errno, "Failed to stat /etc/localtime, ignoring: %m"); + changed = m->etc_localtime_accessible; + m->etc_localtime_accessible = false; + } else { + usec_t k; + + k = timespec_load(&st.st_mtim); + changed = !m->etc_localtime_accessible || k != m->etc_localtime_mtime; + + m->etc_localtime_mtime = k; + m->etc_localtime_accessible = true; + } + + return changed; +} + +static int manager_setup_timezone_change(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *new_event = NULL; + int r; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + /* We watch /etc/localtime for three events: change of the link count (which might mean removal from /etc even + * though another link might be kept), renames, and file close operations after writing. Note we don't bother + * with IN_DELETE_SELF, as that would just report when the inode is removed entirely, i.e. after the link count + * went to zero and all fds to it are closed. + * + * Note that we never follow symlinks here. This is a simplification, but should cover almost all cases + * correctly. + * + * Note that we create the new event source first here, before releasing the old one. This should optimize + * behaviour as this way sd-event can reuse the old watch in case the inode didn't change. */ + + r = sd_event_add_inotify(m->event, &new_event, "/etc/localtime", + IN_ATTRIB|IN_MOVE_SELF|IN_CLOSE_WRITE|IN_DONT_FOLLOW, manager_dispatch_timezone_change, m); + if (r == -ENOENT) { + /* If the file doesn't exist yet, subscribe to /etc instead, and wait until it is created either by + * O_CREATE or by rename() */ + + log_debug_errno(r, "/etc/localtime doesn't exist yet, watching /etc instead."); + r = sd_event_add_inotify(m->event, &new_event, "/etc", + IN_CREATE|IN_MOVED_TO|IN_ONLYDIR, manager_dispatch_timezone_change, m); + } + if (r < 0) + return log_error_errno(r, "Failed to create timezone change event source: %m"); + + /* Schedule this slightly earlier than the .timer event sources */ + r = sd_event_source_set_priority(new_event, SD_EVENT_PRIORITY_NORMAL-1); + if (r < 0) + return log_error_errno(r, "Failed to set priority of timezone change event sources: %m"); + + sd_event_source_unref(m->timezone_change_event_source); + m->timezone_change_event_source = TAKE_PTR(new_event); + + return 0; +} + +static int enable_special_signals(Manager *m) { + _cleanup_close_ int fd = -1; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + /* Enable that we get SIGINT on control-alt-del. In containers + * this will fail with EPERM (older) or EINVAL (newer), so + * ignore that. */ + if (reboot(RB_DISABLE_CAD) < 0 && !IN_SET(errno, EPERM, EINVAL)) + log_warning_errno(errno, "Failed to enable ctrl-alt-del handling: %m"); + + fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC); + if (fd < 0) { + /* Support systems without virtual console */ + if (fd != -ENOENT) + log_warning_errno(errno, "Failed to open /dev/tty0: %m"); + } else { + /* Enable that we get SIGWINCH on kbrequest */ + if (ioctl(fd, KDSIGACCEPT, SIGWINCH) < 0) + log_warning_errno(errno, "Failed to enable kbrequest handling: %m"); + } + + return 0; +} + +#define RTSIG_IF_AVAILABLE(signum) (signum <= SIGRTMAX ? signum : -1) + +static int manager_setup_signals(Manager *m) { + struct sigaction sa = { + .sa_handler = SIG_DFL, + .sa_flags = SA_NOCLDSTOP|SA_RESTART, + }; + sigset_t mask; + int r; + + assert(m); + + assert_se(sigaction(SIGCHLD, &sa, NULL) == 0); + + /* We make liberal use of realtime signals here. On + * Linux/glibc we have 30 of them (with the exception of Linux + * on hppa, see below), between SIGRTMIN+0 ... SIGRTMIN+30 + * (aka SIGRTMAX). */ + + assert_se(sigemptyset(&mask) == 0); + sigset_add_many(&mask, + SIGCHLD, /* Child died */ + SIGTERM, /* Reexecute daemon */ + SIGHUP, /* Reload configuration */ + SIGUSR1, /* systemd/upstart: reconnect to D-Bus */ + SIGUSR2, /* systemd: dump status */ + SIGINT, /* Kernel sends us this on control-alt-del */ + SIGWINCH, /* Kernel sends us this on kbrequest (alt-arrowup) */ + SIGPWR, /* Some kernel drivers and upsd send us this on power failure */ + + SIGRTMIN+0, /* systemd: start default.target */ + SIGRTMIN+1, /* systemd: isolate rescue.target */ + SIGRTMIN+2, /* systemd: isolate emergency.target */ + SIGRTMIN+3, /* systemd: start halt.target */ + SIGRTMIN+4, /* systemd: start poweroff.target */ + SIGRTMIN+5, /* systemd: start reboot.target */ + SIGRTMIN+6, /* systemd: start kexec.target */ + + /* ... space for more special targets ... */ + + SIGRTMIN+13, /* systemd: Immediate halt */ + SIGRTMIN+14, /* systemd: Immediate poweroff */ + SIGRTMIN+15, /* systemd: Immediate reboot */ + SIGRTMIN+16, /* systemd: Immediate kexec */ + + /* ... space for more immediate system state changes ... */ + + SIGRTMIN+20, /* systemd: enable status messages */ + SIGRTMIN+21, /* systemd: disable status messages */ + SIGRTMIN+22, /* systemd: set log level to LOG_DEBUG */ + SIGRTMIN+23, /* systemd: set log level to LOG_INFO */ + SIGRTMIN+24, /* systemd: Immediate exit (--user only) */ + + /* .. one free signal here ... */ + + /* Apparently Linux on hppa had fewer RT signals until v3.18, + * SIGRTMAX was SIGRTMIN+25, and then SIGRTMIN was lowered, + * see commit v3.17-7614-g1f25df2eff. + * + * We cannot unconditionally make use of those signals here, + * so let's use a runtime check. Since these commands are + * accessible by different means and only really a safety + * net, the missing functionality on hppa shouldn't matter. + */ + + RTSIG_IF_AVAILABLE(SIGRTMIN+26), /* systemd: set log target to journal-or-kmsg */ + RTSIG_IF_AVAILABLE(SIGRTMIN+27), /* systemd: set log target to console */ + RTSIG_IF_AVAILABLE(SIGRTMIN+28), /* systemd: set log target to kmsg */ + RTSIG_IF_AVAILABLE(SIGRTMIN+29), /* systemd: set log target to syslog-or-kmsg (obsolete) */ + + /* ... one free signal here SIGRTMIN+30 ... */ + -1); + assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); + + m->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC); + if (m->signal_fd < 0) + return -errno; + + r = sd_event_add_io(m->event, &m->signal_event_source, m->signal_fd, EPOLLIN, manager_dispatch_signal_fd, m); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->signal_event_source, "manager-signal"); + + /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the + * notify processing can still figure out to which process/service a message belongs, before we reap the + * process. Also, process this before handling cgroup notifications, so that we always collect child exit + * status information before detecting that there's no process in a cgroup. */ + r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(m)) + return enable_special_signals(m); + + return 0; +} + +static char** sanitize_environment(char **l) { + + /* Let's remove some environment variables that we need ourselves to communicate with our clients */ + strv_env_unset_many( + l, + "EXIT_CODE", + "EXIT_STATUS", + "INVOCATION_ID", + "JOURNAL_STREAM", + "LISTEN_FDNAMES", + "LISTEN_FDS", + "LISTEN_PID", + "MAINPID", + "MANAGERPID", + "NOTIFY_SOCKET", + "REMOTE_ADDR", + "REMOTE_PORT", + "SERVICE_RESULT", + "WATCHDOG_PID", + "WATCHDOG_USEC", + NULL); + + /* Let's order the environment alphabetically, just to make it pretty */ + strv_sort(l); + + return l; +} + +int manager_default_environment(Manager *m) { + assert(m); + + m->transient_environment = strv_free(m->transient_environment); + + if (MANAGER_IS_SYSTEM(m)) { + /* The system manager always starts with a clean + * environment for its children. It does not import + * the kernel's or the parents' exported variables. + * + * The initial passed environment is untouched to keep + * /proc/self/environ valid; it is used for tagging + * the init process inside containers. */ + m->transient_environment = strv_new("PATH=" DEFAULT_PATH); + + /* Import locale variables LC_*= from configuration */ + (void) locale_setup(&m->transient_environment); + } else + /* The user manager passes its own environment + * along to its children. */ + m->transient_environment = strv_copy(environ); + + if (!m->transient_environment) + return log_oom(); + + sanitize_environment(m->transient_environment); + + return 0; +} + +static int manager_setup_prefix(Manager *m) { + struct table_entry { + uint64_t type; + const char *suffix; + }; + + static const struct table_entry paths_system[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME, NULL }, + [EXEC_DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE, NULL }, + [EXEC_DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE, NULL }, + [EXEC_DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS, NULL }, + [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_SYSTEM_CONFIGURATION, NULL }, + }; + + static const struct table_entry paths_user[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME, NULL }, + [EXEC_DIRECTORY_STATE] = { SD_PATH_USER_CONFIGURATION, NULL }, + [EXEC_DIRECTORY_CACHE] = { SD_PATH_USER_STATE_CACHE, NULL }, + [EXEC_DIRECTORY_LOGS] = { SD_PATH_USER_CONFIGURATION, "log" }, + [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_USER_CONFIGURATION, NULL }, + }; + + const struct table_entry *p; + ExecDirectoryType i; + int r; + + assert(m); + + if (MANAGER_IS_SYSTEM(m)) + p = paths_system; + else + p = paths_user; + + for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) { + r = sd_path_home(p[i].type, p[i].suffix, &m->prefix[i]); + if (r < 0) + return r; + } + + return 0; +} + +static int manager_setup_run_queue(Manager *m) { + int r; + + assert(m); + assert(!m->run_queue_event_source); + + r = sd_event_add_defer(m->event, &m->run_queue_event_source, manager_dispatch_run_queue, m); + if (r < 0) + return r; + + r = sd_event_source_set_priority(m->run_queue_event_source, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(m->run_queue_event_source, SD_EVENT_OFF); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->run_queue_event_source, "manager-run-queue"); + + return 0; +} + +static int manager_setup_sigchld_event_source(Manager *m) { + int r; + + assert(m); + assert(!m->sigchld_event_source); + + r = sd_event_add_defer(m->event, &m->sigchld_event_source, manager_dispatch_sigchld, m); + if (r < 0) + return r; + + r = sd_event_source_set_priority(m->sigchld_event_source, SD_EVENT_PRIORITY_NORMAL-7); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->sigchld_event_source, "manager-sigchld"); + + return 0; +} + +int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager **_m) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + assert(_m); + assert(IN_SET(scope, UNIT_FILE_SYSTEM, UNIT_FILE_USER)); + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .unit_file_scope = scope, + .objective = _MANAGER_OBJECTIVE_INVALID, + + .default_timer_accuracy_usec = USEC_PER_MINUTE, + .default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT, + .default_tasks_accounting = true, + .default_tasks_max = UINT64_MAX, + .default_timeout_start_usec = DEFAULT_TIMEOUT_USEC, + .default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC, + .default_restart_usec = DEFAULT_RESTART_USEC, + + .original_log_level = -1, + .original_log_target = _LOG_TARGET_INVALID, + + .notify_fd = -1, + .cgroups_agent_fd = -1, + .signal_fd = -1, + .time_change_fd = -1, + .user_lookup_fds = { -1, -1 }, + .private_listen_fd = -1, + .dev_autofs_fd = -1, + .cgroup_inotify_fd = -1, + .pin_cgroupfs_fd = -1, + .ask_password_inotify_fd = -1, + .idle_pipe = { -1, -1, -1, -1}, + + /* start as id #1, so that we can leave #0 around as "null-like" value */ + .current_job_id = 1, + + .have_ask_password = -EINVAL, /* we don't know */ + .first_boot = -1, + .test_run_flags = test_run_flags, + }; + +#if ENABLE_EFI + if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) + boot_timestamps(m->timestamps + MANAGER_TIMESTAMP_USERSPACE, + m->timestamps + MANAGER_TIMESTAMP_FIRMWARE, + m->timestamps + MANAGER_TIMESTAMP_LOADER); +#endif + + /* Prepare log fields we can use for structured logging */ + if (MANAGER_IS_SYSTEM(m)) { + m->unit_log_field = "UNIT="; + m->unit_log_format_string = "UNIT=%s"; + + m->invocation_log_field = "INVOCATION_ID="; + m->invocation_log_format_string = "INVOCATION_ID=%s"; + } else { + m->unit_log_field = "USER_UNIT="; + m->unit_log_format_string = "USER_UNIT=%s"; + + m->invocation_log_field = "USER_INVOCATION_ID="; + m->invocation_log_format_string = "USER_INVOCATION_ID=%s"; + } + + /* Reboot immediately if the user hits C-A-D more often than 7x per 2s */ + RATELIMIT_INIT(m->ctrl_alt_del_ratelimit, 2 * USEC_PER_SEC, 7); + + r = manager_default_environment(m); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->units, &string_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->jobs, NULL); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->cgroup_unit, &path_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->watch_bus, &string_hash_ops); + if (r < 0) + return r; + + r = manager_setup_prefix(m); + if (r < 0) + return r; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + r = manager_setup_run_queue(m); + if (r < 0) + return r; + + if (test_run_flags == MANAGER_TEST_RUN_MINIMAL) { + m->cgroup_root = strdup(""); + if (!m->cgroup_root) + return -ENOMEM; + } else { + r = manager_setup_signals(m); + if (r < 0) + return r; + + r = manager_setup_cgroup(m); + if (r < 0) + return r; + + r = manager_setup_time_change(m); + if (r < 0) + return r; + + r = manager_read_timezone_stat(m); + if (r < 0) + return r; + + (void) manager_setup_timezone_change(m); + + r = manager_setup_sigchld_event_source(m); + if (r < 0) + return r; + } + + if (MANAGER_IS_SYSTEM(m) && test_run_flags == 0) { + r = mkdir_label("/run/systemd/units", 0755); + if (r < 0 && r != -EEXIST) + return r; + } + + m->taint_usr = + !in_initrd() && + dir_is_empty("/usr") > 0; + + /* Note that we do not set up the notify fd here. We do that after deserialization, + * since they might have gotten serialized across the reexec. */ + + *_m = TAKE_PTR(m); + + return 0; +} + +static int manager_setup_notify(Manager *m) { + int r; + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + if (m->notify_fd < 0) { + _cleanup_close_ int fd = -1; + union sockaddr_union sa = {}; + int salen; + + /* First free all secondary fields */ + m->notify_socket = mfree(m->notify_socket); + m->notify_event_source = sd_event_source_unref(m->notify_event_source); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate notification socket: %m"); + + fd_inc_rcvbuf(fd, NOTIFY_RCVBUF_SIZE); + + m->notify_socket = strappend(m->prefix[EXEC_DIRECTORY_RUNTIME], "/systemd/notify"); + if (!m->notify_socket) + return log_oom(); + + salen = sockaddr_un_set_path(&sa.un, m->notify_socket); + if (salen < 0) + return log_error_errno(salen, "Notify socket '%s' not valid for AF_UNIX socket address, refusing.", m->notify_socket); + + (void) mkdir_parents_label(m->notify_socket, 0755); + (void) sockaddr_un_unlink(&sa.un); + + r = bind(fd, &sa.sa, salen); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", m->notify_socket); + + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + m->notify_fd = TAKE_FD(fd); + + log_debug("Using notification socket %s", m->notify_socket); + } + + if (!m->notify_event_source) { + r = sd_event_add_io(m->event, &m->notify_event_source, m->notify_fd, EPOLLIN, manager_dispatch_notify_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate notify event source: %m"); + + /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which + * service an exit message belongs. */ + r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-8); + if (r < 0) + return log_error_errno(r, "Failed to set priority of notify event source: %m"); + + (void) sd_event_source_set_description(m->notify_event_source, "manager-notify"); + } + + return 0; +} + +static int manager_setup_cgroups_agent(Manager *m) { + + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/cgroups-agent", + }; + int r; + + /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering + * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and + * each instance of it needs a new D-Bus connection. Since D-Bus connections are SOCK_STREAM/AF_UNIX, on + * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number + * of D-Bus connections may be queued until the kernel will start dropping further incoming connections, + * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX + * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and + * we thus won't lose messages. + * + * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen + * to it. The system instance hence listens on this special socket, but the user instances listen on the system + * bus for these messages. */ + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Failed to determine whether unified cgroups hierarchy is used: %m"); + if (r > 0) /* We don't need this anymore on the unified hierarchy */ + return 0; + + if (m->cgroups_agent_fd < 0) { + _cleanup_close_ int fd = -1; + + /* First free all secondary fields */ + m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m"); + + fd_inc_rcvbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE); + + (void) sockaddr_un_unlink(&sa.un); + + /* Only allow root to connect to this socket */ + RUN_WITH_UMASK(0077) + r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + m->cgroups_agent_fd = TAKE_FD(fd); + } + + if (!m->cgroups_agent_event_source) { + r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate cgroups agent event source: %m"); + + /* Process cgroups notifications early, but after having processed service notification messages or + * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification, + * and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of + * cgroup inotify for the unified cgroup stuff. */ + r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-4); + if (r < 0) + return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m"); + + (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent"); + } + + return 0; +} + +static int manager_setup_user_lookup_fd(Manager *m) { + int r; + + assert(m); + + /* Set up the socket pair used for passing UID/GID resolution results from forked off processes to PID + * 1. Background: we can't do name lookups (NSS) from PID 1, since it might involve IPC and thus activation, + * and we might hence deadlock on ourselves. Hence we do all user/group lookups asynchronously from the forked + * off processes right before executing the binaries to start. In order to be able to clean up any IPC objects + * created by a unit (see RemoveIPC=) we need to know in PID 1 the used UID/GID of the executed processes, + * hence we establish this communication channel so that forked off processes can pass their UID/GID + * information back to PID 1. The forked off processes send their resolved UID/GID to PID 1 in a simple + * datagram, along with their unit name, so that we can share one communication socket pair among all units for + * this purpose. + * + * You might wonder why we need a communication channel for this that is independent of the usual notification + * socket scheme (i.e. $NOTIFY_SOCKET). The primary difference is about trust: data sent via the $NOTIFY_SOCKET + * channel is only accepted if it originates from the right unit and if reception was enabled for it. The user + * lookup socket OTOH is only accessible by PID 1 and its children until they exec(), and always available. + * + * Note that this function is called under two circumstances: when we first initialize (in which case we + * allocate both the socket pair and the event source to listen on it), and when we deserialize after a reload + * (in which case the socket pair already exists but we still need to allocate the event source for it). */ + + if (m->user_lookup_fds[0] < 0) { + + /* Free all secondary fields */ + safe_close_pair(m->user_lookup_fds); + m->user_lookup_event_source = sd_event_source_unref(m->user_lookup_event_source); + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->user_lookup_fds) < 0) + return log_error_errno(errno, "Failed to allocate user lookup socket: %m"); + + (void) fd_inc_rcvbuf(m->user_lookup_fds[0], NOTIFY_RCVBUF_SIZE); + } + + if (!m->user_lookup_event_source) { + r = sd_event_add_io(m->event, &m->user_lookup_event_source, m->user_lookup_fds[0], EPOLLIN, manager_dispatch_user_lookup_fd, m); + if (r < 0) + return log_error_errno(errno, "Failed to allocate user lookup event source: %m"); + + /* Process even earlier than the notify event source, so that we always know first about valid UID/GID + * resolutions */ + r = sd_event_source_set_priority(m->user_lookup_event_source, SD_EVENT_PRIORITY_NORMAL-11); + if (r < 0) + return log_error_errno(errno, "Failed to set priority ot user lookup event source: %m"); + + (void) sd_event_source_set_description(m->user_lookup_event_source, "user-lookup"); + } + + return 0; +} + +static unsigned manager_dispatch_cleanup_queue(Manager *m) { + Unit *u; + unsigned n = 0; + + assert(m); + + while ((u = m->cleanup_queue)) { + assert(u->in_cleanup_queue); + + unit_free(u); + n++; + } + + return n; +} + +enum { + GC_OFFSET_IN_PATH, /* This one is on the path we were traveling */ + GC_OFFSET_UNSURE, /* No clue */ + GC_OFFSET_GOOD, /* We still need this unit */ + GC_OFFSET_BAD, /* We don't need this unit anymore */ + _GC_OFFSET_MAX +}; + +static void unit_gc_mark_good(Unit *u, unsigned gc_marker) { + Unit *other; + Iterator i; + void *v; + + u->gc_marker = gc_marker + GC_OFFSET_GOOD; + + /* Recursively mark referenced units as GOOD as well */ + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_REFERENCES], i) + if (other->gc_marker == gc_marker + GC_OFFSET_UNSURE) + unit_gc_mark_good(other, gc_marker); +} + +static void unit_gc_sweep(Unit *u, unsigned gc_marker) { + Unit *other; + bool is_bad; + Iterator i; + void *v; + + assert(u); + + if (IN_SET(u->gc_marker - gc_marker, + GC_OFFSET_GOOD, GC_OFFSET_BAD, GC_OFFSET_UNSURE, GC_OFFSET_IN_PATH)) + return; + + if (u->in_cleanup_queue) + goto bad; + + if (!unit_may_gc(u)) + goto good; + + u->gc_marker = gc_marker + GC_OFFSET_IN_PATH; + + is_bad = true; + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_REFERENCED_BY], i) { + unit_gc_sweep(other, gc_marker); + + if (other->gc_marker == gc_marker + GC_OFFSET_GOOD) + goto good; + + if (other->gc_marker != gc_marker + GC_OFFSET_BAD) + is_bad = false; + } + + if (u->refs_by_target) { + const UnitRef *ref; + + LIST_FOREACH(refs_by_target, ref, u->refs_by_target) { + unit_gc_sweep(ref->source, gc_marker); + + if (ref->source->gc_marker == gc_marker + GC_OFFSET_GOOD) + goto good; + + if (ref->source->gc_marker != gc_marker + GC_OFFSET_BAD) + is_bad = false; + } + } + + if (is_bad) + goto bad; + + /* We were unable to find anything out about this entry, so + * let's investigate it later */ + u->gc_marker = gc_marker + GC_OFFSET_UNSURE; + unit_add_to_gc_queue(u); + return; + +bad: + /* We definitely know that this one is not useful anymore, so + * let's mark it for deletion */ + u->gc_marker = gc_marker + GC_OFFSET_BAD; + unit_add_to_cleanup_queue(u); + return; + +good: + unit_gc_mark_good(u, gc_marker); +} + +static unsigned manager_dispatch_gc_unit_queue(Manager *m) { + unsigned n = 0, gc_marker; + Unit *u; + + assert(m); + + /* log_debug("Running GC..."); */ + + m->gc_marker += _GC_OFFSET_MAX; + if (m->gc_marker + _GC_OFFSET_MAX <= _GC_OFFSET_MAX) + m->gc_marker = 1; + + gc_marker = m->gc_marker; + + while ((u = m->gc_unit_queue)) { + assert(u->in_gc_queue); + + unit_gc_sweep(u, gc_marker); + + LIST_REMOVE(gc_queue, m->gc_unit_queue, u); + u->in_gc_queue = false; + + n++; + + if (IN_SET(u->gc_marker - gc_marker, + GC_OFFSET_BAD, GC_OFFSET_UNSURE)) { + if (u->id) + log_unit_debug(u, "Collecting."); + u->gc_marker = gc_marker + GC_OFFSET_BAD; + unit_add_to_cleanup_queue(u); + } + } + + return n; +} + +static unsigned manager_dispatch_gc_job_queue(Manager *m) { + unsigned n = 0; + Job *j; + + assert(m); + + while ((j = m->gc_job_queue)) { + assert(j->in_gc_queue); + + LIST_REMOVE(gc_queue, m->gc_job_queue, j); + j->in_gc_queue = false; + + n++; + + if (!job_may_gc(j)) + continue; + + log_unit_debug(j->unit, "Collecting job."); + (void) job_finish_and_invalidate(j, JOB_COLLECTED, false, false); + } + + return n; +} + +static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) { + unsigned n = 0; + Unit *u; + int r; + + assert(m); + + while ((u = m->stop_when_unneeded_queue)) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + assert(m->stop_when_unneeded_queue); + + assert(u->in_stop_when_unneeded_queue); + LIST_REMOVE(stop_when_unneeded_queue, m->stop_when_unneeded_queue, u); + u->in_stop_when_unneeded_queue = false; + + n++; + + if (!unit_is_unneeded(u)) + continue; + + log_unit_debug(u, "Unit is not needed anymore."); + + /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the + * service being unnecessary after a while. */ + + if (!ratelimit_below(&u->auto_stop_ratelimit)) { + log_unit_warning(u, "Unit not needed anymore, but not stopping since we tried this too often recently."); + continue; + } + + /* Ok, nobody needs us anymore. Sniff. Then let's commit suicide */ + r = manager_add_job(u->manager, JOB_STOP, u, JOB_FAIL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r)); + } + + return n; +} + +static void manager_clear_jobs_and_units(Manager *m) { + Unit *u; + + assert(m); + + while ((u = hashmap_first(m->units))) + unit_free(u); + + manager_dispatch_cleanup_queue(m); + + assert(!m->load_queue); + assert(!m->run_queue); + assert(!m->dbus_unit_queue); + assert(!m->dbus_job_queue); + assert(!m->cleanup_queue); + assert(!m->gc_unit_queue); + assert(!m->gc_job_queue); + assert(!m->stop_when_unneeded_queue); + + assert(hashmap_isempty(m->jobs)); + assert(hashmap_isempty(m->units)); + + m->n_on_console = 0; + m->n_running_jobs = 0; + m->n_installed_jobs = 0; + m->n_failed_jobs = 0; +} + +Manager* manager_free(Manager *m) { + ExecDirectoryType dt; + UnitType c; + + if (!m) + return NULL; + + manager_clear_jobs_and_units(m); + + for (c = 0; c < _UNIT_TYPE_MAX; c++) + if (unit_vtable[c]->shutdown) + unit_vtable[c]->shutdown(m); + + /* Keep the cgroup hierarchy in place except when we know we are going down for good */ + manager_shutdown_cgroup(m, IN_SET(m->objective, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC)); + + lookup_paths_flush_generator(&m->lookup_paths); + + bus_done(m); + + exec_runtime_vacuum(m); + hashmap_free(m->exec_runtime_by_id); + + dynamic_user_vacuum(m, false); + hashmap_free(m->dynamic_users); + + hashmap_free(m->units); + hashmap_free(m->units_by_invocation_id); + hashmap_free(m->jobs); + hashmap_free(m->watch_pids); + hashmap_free(m->watch_bus); + + set_free(m->startup_units); + set_free(m->failed_units); + + sd_event_source_unref(m->signal_event_source); + sd_event_source_unref(m->sigchld_event_source); + sd_event_source_unref(m->notify_event_source); + sd_event_source_unref(m->cgroups_agent_event_source); + sd_event_source_unref(m->time_change_event_source); + sd_event_source_unref(m->timezone_change_event_source); + sd_event_source_unref(m->jobs_in_progress_event_source); + sd_event_source_unref(m->run_queue_event_source); + sd_event_source_unref(m->user_lookup_event_source); + sd_event_source_unref(m->sync_bus_names_event_source); + + safe_close(m->signal_fd); + safe_close(m->notify_fd); + safe_close(m->cgroups_agent_fd); + safe_close(m->time_change_fd); + safe_close_pair(m->user_lookup_fds); + + manager_close_ask_password(m); + + manager_close_idle_pipe(m); + + sd_event_unref(m->event); + + free(m->notify_socket); + + lookup_paths_free(&m->lookup_paths); + strv_free(m->transient_environment); + strv_free(m->client_environment); + + hashmap_free(m->cgroup_unit); + set_free_free(m->unit_path_cache); + + free(m->switch_root); + free(m->switch_root_init); + + rlimit_free_all(m->rlimit); + + assert(hashmap_isempty(m->units_requiring_mounts_for)); + hashmap_free(m->units_requiring_mounts_for); + + hashmap_free(m->uid_refs); + hashmap_free(m->gid_refs); + + for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) + m->prefix[dt] = mfree(m->prefix[dt]); + + return mfree(m); +} + +static void manager_enumerate_perpetual(Manager *m) { + UnitType c; + + assert(m); + + /* Let's ask every type to load all units from disk/kernel that it might know */ + for (c = 0; c < _UNIT_TYPE_MAX; c++) { + if (!unit_type_supported(c)) { + log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(c)); + continue; + } + + if (unit_vtable[c]->enumerate_perpetual) + unit_vtable[c]->enumerate_perpetual(m); + } +} + +static void manager_enumerate(Manager *m) { + UnitType c; + + assert(m); + + /* Let's ask every type to load all units from disk/kernel that it might know */ + for (c = 0; c < _UNIT_TYPE_MAX; c++) { + if (!unit_type_supported(c)) { + log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(c)); + continue; + } + + if (unit_vtable[c]->enumerate) + unit_vtable[c]->enumerate(m); + } + + manager_dispatch_load_queue(m); +} + +static void manager_coldplug(Manager *m) { + Iterator i; + Unit *u; + char *k; + int r; + + assert(m); + + log_debug("Invoking unit coldplug() handlers…"); + + /* Let's place the units back into their deserialized state */ + HASHMAP_FOREACH_KEY(u, k, m->units, i) { + + /* ignore aliases */ + if (u->id != k) + continue; + + r = unit_coldplug(u); + if (r < 0) + log_warning_errno(r, "We couldn't coldplug %s, proceeding anyway: %m", u->id); + } +} + +static void manager_catchup(Manager *m) { + Iterator i; + Unit *u; + char *k; + + assert(m); + + log_debug("Invoking unit catchup() handlers…"); + + /* Let's catch up on any state changes that happened while we were reloading/reexecing */ + HASHMAP_FOREACH_KEY(u, k, m->units, i) { + + /* ignore aliases */ + if (u->id != k) + continue; + + unit_catchup(u); + } +} + +static void manager_build_unit_path_cache(Manager *m) { + char **i; + int r; + + assert(m); + + set_free_free(m->unit_path_cache); + + m->unit_path_cache = set_new(&path_hash_ops); + if (!m->unit_path_cache) { + r = -ENOMEM; + goto fail; + } + + /* This simply builds a list of files we know exist, so that + * we don't always have to go to disk */ + + STRV_FOREACH(i, m->lookup_paths.search_path) { + _cleanup_closedir_ DIR *d = NULL; + struct dirent *de; + + d = opendir(*i); + if (!d) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open directory %s, ignoring: %m", *i); + continue; + } + + FOREACH_DIRENT(de, d, r = -errno; goto fail) { + char *p; + + p = strjoin(streq(*i, "/") ? "" : *i, "/", de->d_name); + if (!p) { + r = -ENOMEM; + goto fail; + } + + r = set_consume(m->unit_path_cache, p); + if (r < 0) + goto fail; + } + } + + return; + +fail: + log_warning_errno(r, "Failed to build unit path cache, proceeding without: %m"); + m->unit_path_cache = set_free_free(m->unit_path_cache); +} + +static void manager_distribute_fds(Manager *m, FDSet *fds) { + Iterator i; + Unit *u; + + assert(m); + + HASHMAP_FOREACH(u, m->units, i) { + + if (fdset_size(fds) <= 0) + break; + + if (!UNIT_VTABLE(u)->distribute_fds) + continue; + + UNIT_VTABLE(u)->distribute_fds(u, fds); + } +} + +static bool manager_dbus_is_running(Manager *m, bool deserialized) { + Unit *u; + + assert(m); + + /* This checks whether the dbus instance we are supposed to expose our APIs on is up. We check both the socket + * and the service unit. If the 'deserialized' parameter is true we'll check the deserialized state of the unit + * rather than the current one. */ + + if (MANAGER_IS_TEST_RUN(m)) + return false; + + u = manager_get_unit(m, SPECIAL_DBUS_SOCKET); + if (!u) + return false; + if ((deserialized ? SOCKET(u)->deserialized_state : SOCKET(u)->state) != SOCKET_RUNNING) + return false; + + u = manager_get_unit(m, SPECIAL_DBUS_SERVICE); + if (!u) + return false; + if (!IN_SET((deserialized ? SERVICE(u)->deserialized_state : SERVICE(u)->state), SERVICE_RUNNING, SERVICE_RELOAD)) + return false; + + return true; +} + +static void manager_setup_bus(Manager *m) { + assert(m); + + /* Let's set up our private bus connection now, unconditionally */ + (void) bus_init_private(m); + + /* If we are in --user mode also connect to the system bus now */ + if (MANAGER_IS_USER(m)) + (void) bus_init_system(m); + + /* Let's connect to the bus now, but only if the unit is supposed to be up */ + if (manager_dbus_is_running(m, MANAGER_IS_RELOADING(m))) { + (void) bus_init_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_init_system(m); + } +} + +static void manager_preset_all(Manager *m) { + int r; + + assert(m); + + if (m->first_boot <= 0) + return; + + if (!MANAGER_IS_SYSTEM(m)) + return; + + if (MANAGER_IS_TEST_RUN(m)) + return; + + /* If this is the first boot, and we are in the host system, then preset everything */ + r = unit_file_preset_all(UNIT_FILE_SYSTEM, 0, NULL, UNIT_FILE_PRESET_ENABLE_ONLY, NULL, 0); + if (r < 0) + log_full_errno(r == -EEXIST ? LOG_NOTICE : LOG_WARNING, r, + "Failed to populate /etc with preset unit settings, ignoring: %m"); + else + log_info("Populated /etc with preset unit settings."); +} + +static void manager_vacuum(Manager *m) { + assert(m); + + /* Release any dynamic users no longer referenced */ + dynamic_user_vacuum(m, true); + + /* Release any references to UIDs/GIDs no longer referenced, and destroy any IPC owned by them */ + manager_vacuum_uid_refs(m); + manager_vacuum_gid_refs(m); + + /* Release any runtimes no longer referenced */ + exec_runtime_vacuum(m); +} + +static void manager_ready(Manager *m) { + assert(m); + + /* After having loaded everything, do the final round of catching up with what might have changed */ + + m->objective = MANAGER_OK; /* Tell everyone we are up now */ + + /* It might be safe to log to the journal now and connect to dbus */ + manager_recheck_journal(m); + manager_recheck_dbus(m); + + /* Sync current state of bus names with our set of listening units */ + (void) manager_enqueue_sync_bus_names(m); + + /* Let's finally catch up with any changes that took place while we were reloading/reexecing */ + manager_catchup(m); +} + +static Manager* manager_reloading_start(Manager *m) { + m->n_reloading++; + return m; +} +static void manager_reloading_stopp(Manager **m) { + if (*m) { + assert((*m)->n_reloading > 0); + (*m)->n_reloading--; + } +} + +int manager_startup(Manager *m, FILE *serialization, FDSet *fds) { + int r; + + assert(m); + + /* If we are running in test mode, we still want to run the generators, + * but we should not touch the real generator directories. */ + r = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, + MANAGER_IS_TEST_RUN(m) ? LOOKUP_PATHS_TEMPORARY_GENERATED : 0, + NULL); + if (r < 0) + return log_error_errno(r, "Failed to initialize path lookup table: %m"); + + dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_START)); + r = manager_run_environment_generators(m); + if (r >= 0) + r = manager_run_generators(m); + dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_FINISH)); + if (r < 0) + return r; + + manager_preset_all(m); + + r = lookup_paths_reduce(&m->lookup_paths); + if (r < 0) + log_warning_errno(r, "Failed ot reduce unit file paths, ignoring: %m"); + + manager_build_unit_path_cache(m); + + { + /* This block is (optionally) done with the reloading counter bumped */ + _cleanup_(manager_reloading_stopp) Manager *reloading = NULL; + + /* If we will deserialize make sure that during enumeration this is already known, so we increase the + * counter here already */ + if (serialization) + reloading = manager_reloading_start(m); + + /* First, enumerate what we can from all config files */ + dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_START)); + manager_enumerate_perpetual(m); + manager_enumerate(m); + dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_FINISH)); + + /* Second, deserialize if there is something to deserialize */ + if (serialization) { + r = manager_deserialize(m, serialization, fds); + if (r < 0) + return log_error_errno(r, "Deserialization failed: %m"); + } + + /* Any fds left? Find some unit which wants them. This is useful to allow container managers to pass + * some file descriptors to us pre-initialized. This enables socket-based activation of entire + * containers. */ + manager_distribute_fds(m, fds); + + /* We might have deserialized the notify fd, but if we didn't then let's create the bus now */ + r = manager_setup_notify(m); + if (r < 0) + /* No sense to continue without notifications, our children would fail anyway. */ + return r; + + r = manager_setup_cgroups_agent(m); + if (r < 0) + /* Likewise, no sense to continue without empty cgroup notifications. */ + return r; + + r = manager_setup_user_lookup_fd(m); + if (r < 0) + /* This shouldn't fail, except if things are really broken. */ + return r; + + /* Connect to the bus if we are good for it */ + manager_setup_bus(m); + + /* Now that we are connected to all possible busses, let's deserialize who is tracking us. */ + r = bus_track_coldplug(m, &m->subscribed, false, m->deserialized_subscribed); + if (r < 0) + log_warning_errno(r, "Failed to deserialized tracked clients, ignoring: %m"); + m->deserialized_subscribed = strv_free(m->deserialized_subscribed); + + /* Third, fire things up! */ + manager_coldplug(m); + + /* Clean up runtime objects */ + manager_vacuum(m); + + if (serialization) + /* Let's wait for the UnitNew/JobNew messages being sent, before we notify that the + * reload is finished */ + m->send_reloading_done = true; + } + + manager_ready(m); + + return 0; +} + +int manager_add_job(Manager *m, JobType type, Unit *unit, JobMode mode, sd_bus_error *e, Job **_ret) { + int r; + Transaction *tr; + + assert(m); + assert(type < _JOB_TYPE_MAX); + assert(unit); + assert(mode < _JOB_MODE_MAX); + + if (mode == JOB_ISOLATE && type != JOB_START) + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Isolate is only valid for start."); + + if (mode == JOB_ISOLATE && !unit->allow_isolate) + return sd_bus_error_setf(e, BUS_ERROR_NO_ISOLATION, "Operation refused, unit may not be isolated."); + + log_unit_debug(unit, "Trying to enqueue job %s/%s/%s", unit->id, job_type_to_string(type), job_mode_to_string(mode)); + + type = job_type_collapse(type, unit); + + tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY); + if (!tr) + return -ENOMEM; + + r = transaction_add_job_and_dependencies(tr, type, unit, NULL, true, false, + IN_SET(mode, JOB_IGNORE_DEPENDENCIES, JOB_IGNORE_REQUIREMENTS), + mode == JOB_IGNORE_DEPENDENCIES, e); + if (r < 0) + goto tr_abort; + + if (mode == JOB_ISOLATE) { + r = transaction_add_isolate_jobs(tr, m); + if (r < 0) + goto tr_abort; + } + + r = transaction_activate(tr, m, mode, e); + if (r < 0) + goto tr_abort; + + log_unit_debug(unit, + "Enqueued job %s/%s as %u", unit->id, + job_type_to_string(type), (unsigned) tr->anchor_job->id); + + if (_ret) + *_ret = tr->anchor_job; + + transaction_free(tr); + return 0; + +tr_abort: + transaction_abort(tr); + transaction_free(tr); + return r; +} + +int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, sd_bus_error *e, Job **ret) { + Unit *unit = NULL; /* just to appease gcc, initialization is not really necessary */ + int r; + + assert(m); + assert(type < _JOB_TYPE_MAX); + assert(name); + assert(mode < _JOB_MODE_MAX); + + r = manager_load_unit(m, name, NULL, NULL, &unit); + if (r < 0) + return r; + assert(unit); + + return manager_add_job(m, type, unit, mode, e, ret); +} + +int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Job **ret) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(m); + assert(type < _JOB_TYPE_MAX); + assert(name); + assert(mode < _JOB_MODE_MAX); + + r = manager_add_job_by_name(m, type, name, mode, &error, ret); + if (r < 0) + return log_warning_errno(r, "Failed to enqueue %s job for %s: %s", job_mode_to_string(mode), name, bus_error_message(&error, r)); + + return r; +} + +int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e) { + int r; + Transaction *tr; + + assert(m); + assert(unit); + assert(mode < _JOB_MODE_MAX); + assert(mode != JOB_ISOLATE); /* Isolate is only valid for start */ + + tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY); + if (!tr) + return -ENOMEM; + + /* We need an anchor job */ + r = transaction_add_job_and_dependencies(tr, JOB_NOP, unit, NULL, false, false, true, true, e); + if (r < 0) + goto tr_abort; + + /* Failure in adding individual dependencies is ignored, so this always succeeds. */ + transaction_add_propagate_reload_jobs(tr, unit, tr->anchor_job, mode == JOB_IGNORE_DEPENDENCIES, e); + + r = transaction_activate(tr, m, mode, e); + if (r < 0) + goto tr_abort; + + transaction_free(tr); + return 0; + +tr_abort: + transaction_abort(tr); + transaction_free(tr); + return r; +} + +Job *manager_get_job(Manager *m, uint32_t id) { + assert(m); + + return hashmap_get(m->jobs, UINT32_TO_PTR(id)); +} + +Unit *manager_get_unit(Manager *m, const char *name) { + assert(m); + assert(name); + + return hashmap_get(m->units, name); +} + +static int manager_dispatch_target_deps_queue(Manager *m) { + Unit *u; + unsigned k; + int r = 0; + + static const UnitDependency deps[] = { + UNIT_REQUIRED_BY, + UNIT_REQUISITE_OF, + UNIT_WANTED_BY, + UNIT_BOUND_BY + }; + + assert(m); + + while ((u = m->target_deps_queue)) { + assert(u->in_target_deps_queue); + + LIST_REMOVE(target_deps_queue, u->manager->target_deps_queue, u); + u->in_target_deps_queue = false; + + for (k = 0; k < ELEMENTSOF(deps); k++) { + Unit *target; + Iterator i; + void *v; + + HASHMAP_FOREACH_KEY(v, target, u->dependencies[deps[k]], i) { + r = unit_add_default_target_dependency(u, target); + if (r < 0) + return r; + } + } + } + + return r; +} + +unsigned manager_dispatch_load_queue(Manager *m) { + Unit *u; + unsigned n = 0; + + assert(m); + + /* Make sure we are not run recursively */ + if (m->dispatching_load_queue) + return 0; + + m->dispatching_load_queue = true; + + /* Dispatches the load queue. Takes a unit from the queue and + * tries to load its data until the queue is empty */ + + while ((u = m->load_queue)) { + assert(u->in_load_queue); + + unit_load(u); + n++; + } + + m->dispatching_load_queue = false; + + /* Dispatch the units waiting for their target dependencies to be added now, as all targets that we know about + * should be loaded and have aliases resolved */ + (void) manager_dispatch_target_deps_queue(m); + + return n; +} + +int manager_load_unit_prepare( + Manager *m, + const char *name, + const char *path, + sd_bus_error *e, + Unit **_ret) { + + _cleanup_(unit_freep) Unit *cleanup_ret = NULL; + Unit *ret; + UnitType t; + int r; + + assert(m); + assert(name || path); + assert(_ret); + + /* This will prepare the unit for loading, but not actually + * load anything from disk. */ + + if (path && !is_path(path)) + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path); + + if (!name) + name = basename(path); + + t = unit_name_to_type(name); + + if (t == _UNIT_TYPE_INVALID || !unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) { + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is missing the instance name.", name); + + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is not valid.", name); + } + + ret = manager_get_unit(m, name); + if (ret) { + *_ret = ret; + return 1; + } + + ret = cleanup_ret = unit_new(m, unit_vtable[t]->object_size); + if (!ret) + return -ENOMEM; + + if (path) { + ret->fragment_path = strdup(path); + if (!ret->fragment_path) + return -ENOMEM; + } + + r = unit_add_name(ret, name); + if (r < 0) + return r; + + unit_add_to_load_queue(ret); + unit_add_to_dbus_queue(ret); + unit_add_to_gc_queue(ret); + + *_ret = ret; + cleanup_ret = NULL; + + return 0; +} + +int manager_load_unit( + Manager *m, + const char *name, + const char *path, + sd_bus_error *e, + Unit **_ret) { + + int r; + + assert(m); + assert(_ret); + + /* This will load the service information files, but not actually + * start any services or anything. */ + + r = manager_load_unit_prepare(m, name, path, e, _ret); + if (r != 0) + return r; + + manager_dispatch_load_queue(m); + + *_ret = unit_follow_merge(*_ret); + return 0; +} + +int manager_load_startable_unit_or_warn( + Manager *m, + const char *name, + const char *path, + Unit **ret) { + + /* Load a unit, make sure it loaded fully and is not masked. */ + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *unit; + int r; + + r = manager_load_unit(m, name, path, &error, &unit); + if (r < 0) + return log_error_errno(r, "Failed to load %s %s: %s", + name ? "unit" : "unit file", name ?: path, + bus_error_message(&error, r)); + + r = bus_unit_validate_load_state(unit, &error); + if (r < 0) + return log_error_errno(r, "%s", bus_error_message(&error, r)); + + *ret = unit; + return 0; +} + +void manager_dump_jobs(Manager *s, FILE *f, const char *prefix) { + Iterator i; + Job *j; + + assert(s); + assert(f); + + HASHMAP_FOREACH(j, s->jobs, i) + job_dump(j, f, prefix); +} + +void manager_dump_units(Manager *s, FILE *f, const char *prefix) { + Iterator i; + Unit *u; + const char *t; + + assert(s); + assert(f); + + HASHMAP_FOREACH_KEY(u, t, s->units, i) + if (u->id == t) + unit_dump(u, f, prefix); +} + +void manager_dump(Manager *m, FILE *f, const char *prefix) { + ManagerTimestamp q; + + assert(m); + assert(f); + + for (q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) { + char buf[FORMAT_TIMESTAMP_MAX]; + + if (dual_timestamp_is_set(m->timestamps + q)) + fprintf(f, "%sTimestamp %s: %s\n", + strempty(prefix), + manager_timestamp_to_string(q), + format_timestamp(buf, sizeof(buf), m->timestamps[q].realtime)); + } + + manager_dump_units(m, f, prefix); + manager_dump_jobs(m, f, prefix); +} + +int manager_get_dump_string(Manager *m, char **ret) { + _cleanup_free_ char *dump = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t size; + int r; + + assert(m); + assert(ret); + + f = open_memstream(&dump, &size); + if (!f) + return -errno; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + manager_dump(m, f, NULL); + + r = fflush_and_check(f); + if (r < 0) + return r; + + f = safe_fclose(f); + + *ret = TAKE_PTR(dump); + + return 0; +} + +void manager_clear_jobs(Manager *m) { + Job *j; + + assert(m); + + while ((j = hashmap_first(m->jobs))) + /* No need to recurse. We're cancelling all jobs. */ + job_finish_and_invalidate(j, JOB_CANCELED, false, false); +} + +static int manager_dispatch_run_queue(sd_event_source *source, void *userdata) { + Manager *m = userdata; + Job *j; + + assert(source); + assert(m); + + while ((j = m->run_queue)) { + assert(j->installed); + assert(j->in_run_queue); + + (void) job_run_and_invalidate(j); + } + + if (m->n_running_jobs > 0) + manager_watch_jobs_in_progress(m); + + if (m->n_on_console > 0) + manager_watch_idle_pipe(m); + + return 1; +} + +static unsigned manager_dispatch_dbus_queue(Manager *m) { + unsigned n = 0, budget; + Unit *u; + Job *j; + + assert(m); + + /* When we are reloading, let's not wait with generating signals, since we need to exit the manager as quickly + * as we can. There's no point in throttling generation of signals in that case. */ + if (MANAGER_IS_RELOADING(m) || m->send_reloading_done || m->pending_reload_message) + budget = (unsigned) -1; /* infinite budget in this case */ + else { + /* Anything to do at all? */ + if (!m->dbus_unit_queue && !m->dbus_job_queue) + return 0; + + /* Do we have overly many messages queued at the moment? If so, let's not enqueue more on top, let's + * sit this cycle out, and process things in a later cycle when the queues got a bit emptier. */ + if (manager_bus_n_queued_write(m) > MANAGER_BUS_BUSY_THRESHOLD) + return 0; + + /* Only process a certain number of units/jobs per event loop iteration. Even if the bus queue wasn't + * overly full before this call we shouldn't increase it in size too wildly in one step, and we + * shouldn't monopolize CPU time with generating these messages. Note the difference in counting of + * this "budget" and the "threshold" above: the "budget" is decreased only once per generated message, + * regardless how many busses/direct connections it is enqueued on, while the "threshold" is applied to + * each queued instance of bus message, i.e. if the same message is enqueued to five busses/direct + * connections it will be counted five times. This difference in counting ("references" + * vs. "instances") is primarily a result of the fact that it's easier to implement it this way, + * however it also reflects the thinking that the "threshold" should put a limit on used queue memory, + * i.e. space, while the "budget" should put a limit on time. Also note that the "threshold" is + * currently chosen much higher than the "budget". */ + budget = MANAGER_BUS_MESSAGE_BUDGET; + } + + while (budget != 0 && (u = m->dbus_unit_queue)) { + + assert(u->in_dbus_queue); + + bus_unit_send_change_signal(u); + n++; + + if (budget != (unsigned) -1) + budget--; + } + + while (budget != 0 && (j = m->dbus_job_queue)) { + assert(j->in_dbus_queue); + + bus_job_send_change_signal(j); + n++; + + if (budget != (unsigned) -1) + budget--; + } + + if (m->send_reloading_done) { + m->send_reloading_done = false; + bus_manager_send_reloading(m, false); + n++; + } + + if (m->pending_reload_message) { + bus_send_pending_reload_message(m); + n++; + } + + return n; +} + +static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + char buf[PATH_MAX]; + ssize_t n; + + n = recv(fd, buf, sizeof(buf), 0); + if (n < 0) + return log_error_errno(errno, "Failed to read cgroups agent message: %m"); + if (n == 0) { + log_error("Got zero-length cgroups agent message, ignoring."); + return 0; + } + if ((size_t) n >= sizeof(buf)) { + log_error("Got overly long cgroups agent message, ignoring."); + return 0; + } + + if (memchr(buf, 0, n)) { + log_error("Got cgroups agent message with embedded NUL byte, ignoring."); + return 0; + } + buf[n] = 0; + + manager_notify_cgroup_empty(m, buf); + (void) bus_forward_agent_released(m, buf); + + return 0; +} + +static void manager_invoke_notify_message( + Manager *m, + Unit *u, + const struct ucred *ucred, + const char *buf, + FDSet *fds) { + + assert(m); + assert(u); + assert(ucred); + assert(buf); + + if (u->notifygen == m->notifygen) /* Already invoked on this same unit in this same iteration? */ + return; + u->notifygen = m->notifygen; + + if (UNIT_VTABLE(u)->notify_message) { + _cleanup_strv_free_ char **tags = NULL; + + tags = strv_split(buf, NEWLINE); + if (!tags) { + log_oom(); + return; + } + + UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds); + + } else if (DEBUG_LOGGING) { + _cleanup_free_ char *x = NULL, *y = NULL; + + x = ellipsize(buf, 20, 90); + if (x) + y = cescape(x); + + log_unit_debug(u, "Got notification message \"%s\", ignoring.", strnull(y)); + } +} + +static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + + _cleanup_fdset_free_ FDSet *fds = NULL; + Manager *m = userdata; + char buf[NOTIFY_BUFFER_MAX+1]; + struct iovec iovec = { + .iov_base = buf, + .iov_len = sizeof(buf)-1, + }; + union { + struct cmsghdr cmsghdr; + uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)]; + } control = {}; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + + struct cmsghdr *cmsg; + struct ucred *ucred = NULL; + _cleanup_free_ Unit **array_copy = NULL; + Unit *u1, *u2, **array; + int r, *fd_array = NULL; + size_t n_fds = 0; + bool found = false; + ssize_t n; + + assert(m); + assert(m->notify_fd == fd); + + if (revents != EPOLLIN) { + log_warning("Got unexpected poll event for notify fd."); + return 0; + } + + n = recvmsg(m->notify_fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC); + if (n < 0) { + if (IN_SET(errno, EAGAIN, EINTR)) + return 0; /* Spurious wakeup, try again */ + + /* If this is any other, real error, then let's stop processing this socket. This of course means we + * won't take notification messages anymore, but that's still better than busy looping around this: + * being woken up over and over again but being unable to actually read the message off the socket. */ + return log_error_errno(errno, "Failed to receive notification message: %m"); + } + + CMSG_FOREACH(cmsg, &msghdr) { + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { + + fd_array = (int*) CMSG_DATA(cmsg); + n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + + } else if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS && + cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { + + ucred = (struct ucred*) CMSG_DATA(cmsg); + } + } + + if (n_fds > 0) { + assert(fd_array); + + r = fdset_new_array(&fds, fd_array, n_fds); + if (r < 0) { + close_many(fd_array, n_fds); + log_oom(); + return 0; + } + } + + if (!ucred || !pid_is_valid(ucred->pid)) { + log_warning("Received notify message without valid credentials. Ignoring."); + return 0; + } + + if ((size_t) n >= sizeof(buf) || (msghdr.msg_flags & MSG_TRUNC)) { + log_warning("Received notify message exceeded maximum size. Ignoring."); + return 0; + } + + /* As extra safety check, let's make sure the string we get doesn't contain embedded NUL bytes. We permit one + * trailing NUL byte in the message, but don't expect it. */ + if (n > 1 && memchr(buf, 0, n-1)) { + log_warning("Received notify message with embedded NUL bytes. Ignoring."); + return 0; + } + + /* Make sure it's NUL-terminated. */ + buf[n] = 0; + + /* Increase the generation counter used for filtering out duplicate unit invocations. */ + m->notifygen++; + + /* Notify every unit that might be interested, which might be multiple. */ + u1 = manager_get_unit_by_pid_cgroup(m, ucred->pid); + u2 = hashmap_get(m->watch_pids, PID_TO_PTR(ucred->pid)); + array = hashmap_get(m->watch_pids, PID_TO_PTR(-ucred->pid)); + if (array) { + size_t k = 0; + + while (array[k]) + k++; + + array_copy = newdup(Unit*, array, k+1); + if (!array_copy) + log_oom(); + } + /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle duplicate units + * make sure we only invoke each unit's handler once. */ + if (u1) { + manager_invoke_notify_message(m, u1, ucred, buf, fds); + found = true; + } + if (u2) { + manager_invoke_notify_message(m, u2, ucred, buf, fds); + found = true; + } + if (array_copy) + for (size_t i = 0; array_copy[i]; i++) { + manager_invoke_notify_message(m, array_copy[i], ucred, buf, fds); + found = true; + } + + if (!found) + log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid); + + if (fdset_size(fds) > 0) + log_warning("Got extra auxiliary fds with notification message, closing them."); + + return 0; +} + +static void manager_invoke_sigchld_event( + Manager *m, + Unit *u, + const siginfo_t *si) { + + assert(m); + assert(u); + assert(si); + + /* Already invoked the handler of this unit in this iteration? Then don't process this again */ + if (u->sigchldgen == m->sigchldgen) + return; + u->sigchldgen = m->sigchldgen; + + log_unit_debug(u, "Child "PID_FMT" belongs to %s.", si->si_pid, u->id); + unit_unwatch_pid(u, si->si_pid); + + if (UNIT_VTABLE(u)->sigchld_event) + UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status); +} + +static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) { + Manager *m = userdata; + siginfo_t si = {}; + int r; + + assert(source); + assert(m); + + /* First we call waitd() for a PID and do not reap the zombie. That way we can still access /proc/$PID for it + * while it is a zombie. */ + + if (waitid(P_ALL, 0, &si, WEXITED|WNOHANG|WNOWAIT) < 0) { + + if (errno != ECHILD) + log_error_errno(errno, "Failed to peek for child with waitid(), ignoring: %m"); + + goto turn_off; + } + + if (si.si_pid <= 0) + goto turn_off; + + if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) { + _cleanup_free_ Unit **array_copy = NULL; + _cleanup_free_ char *name = NULL; + Unit *u1, *u2, **array; + + (void) get_process_comm(si.si_pid, &name); + + log_debug("Child "PID_FMT" (%s) died (code=%s, status=%i/%s)", + si.si_pid, strna(name), + sigchld_code_to_string(si.si_code), + si.si_status, + strna(si.si_code == CLD_EXITED + ? exit_status_to_string(si.si_status, EXIT_STATUS_FULL) + : signal_to_string(si.si_status))); + + /* Increase the generation counter used for filtering out duplicate unit invocations */ + m->sigchldgen++; + + /* And now figure out the unit this belongs to, it might be multiple... */ + u1 = manager_get_unit_by_pid_cgroup(m, si.si_pid); + u2 = hashmap_get(m->watch_pids, PID_TO_PTR(si.si_pid)); + array = hashmap_get(m->watch_pids, PID_TO_PTR(-si.si_pid)); + if (array) { + size_t n = 0; + + /* Cound how many entries the array has */ + while (array[n]) + n++; + + /* Make a copy of the array so that we don't trip up on the array changing beneath us */ + array_copy = newdup(Unit*, array, n+1); + if (!array_copy) + log_oom(); + } + + /* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but + * that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for + * each iteration. */ + if (u1) + manager_invoke_sigchld_event(m, u1, &si); + if (u2) + manager_invoke_sigchld_event(m, u2, &si); + if (array_copy) + for (size_t i = 0; array_copy[i]; i++) + manager_invoke_sigchld_event(m, array_copy[i], &si); + } + + /* And now, we actually reap the zombie. */ + if (waitid(P_PID, si.si_pid, &si, WEXITED) < 0) { + log_error_errno(errno, "Failed to dequeue child, ignoring: %m"); + return 0; + } + + return 0; + +turn_off: + /* All children processed for now, turn off event source */ + + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF); + if (r < 0) + return log_error_errno(r, "Failed to disable SIGCHLD event source: %m"); + + return 0; +} + +static void manager_start_target(Manager *m, const char *name, JobMode mode) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + log_debug("Activating special unit %s", name); + + r = manager_add_job_by_name(m, JOB_START, name, mode, &error, NULL); + if (r < 0) + log_error("Failed to enqueue %s job: %s", name, bus_error_message(&error, r)); +} + +static void manager_handle_ctrl_alt_del(Manager *m) { + /* If the user presses C-A-D more than + * 7 times within 2s, we reboot/shutdown immediately, + * unless it was disabled in system.conf */ + + if (ratelimit_below(&m->ctrl_alt_del_ratelimit) || m->cad_burst_action == EMERGENCY_ACTION_NONE) + manager_start_target(m, SPECIAL_CTRL_ALT_DEL_TARGET, JOB_REPLACE_IRREVERSIBLY); + else + emergency_action(m, m->cad_burst_action, EMERGENCY_ACTION_WARN, NULL, -1, + "Ctrl-Alt-Del was pressed more than 7 times within 2s"); +} + +static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + ssize_t n; + struct signalfd_siginfo sfsi; + int r; + + assert(m); + assert(m->signal_fd == fd); + + if (revents != EPOLLIN) { + log_warning("Got unexpected events from signal file descriptor."); + return 0; + } + + n = read(m->signal_fd, &sfsi, sizeof(sfsi)); + if (n != sizeof(sfsi)) { + if (n >= 0) { + log_warning("Truncated read from signal fd (%zu bytes), ignoring!", n); + return 0; + } + + if (IN_SET(errno, EINTR, EAGAIN)) + return 0; + + /* We return an error here, which will kill this handler, + * to avoid a busy loop on read error. */ + return log_error_errno(errno, "Reading from signal fd failed: %m"); + } + + log_received_signal(sfsi.ssi_signo == SIGCHLD || + (sfsi.ssi_signo == SIGTERM && MANAGER_IS_USER(m)) + ? LOG_DEBUG : LOG_INFO, + &sfsi); + + switch (sfsi.ssi_signo) { + + case SIGCHLD: + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON); + if (r < 0) + log_warning_errno(r, "Failed to enable SIGCHLD event source, ignoring: %m"); + + break; + + case SIGTERM: + if (MANAGER_IS_SYSTEM(m)) { + /* This is for compatibility with the original sysvinit */ + if (verify_run_space_and_log("Refusing to reexecute") < 0) + break; + + m->objective = MANAGER_REEXECUTE; + break; + } + + _fallthrough_; + case SIGINT: + if (MANAGER_IS_SYSTEM(m)) + manager_handle_ctrl_alt_del(m); + else + manager_start_target(m, SPECIAL_EXIT_TARGET, + JOB_REPLACE_IRREVERSIBLY); + break; + + case SIGWINCH: + /* This is a nop on non-init */ + if (MANAGER_IS_SYSTEM(m)) + manager_start_target(m, SPECIAL_KBREQUEST_TARGET, JOB_REPLACE); + + break; + + case SIGPWR: + /* This is a nop on non-init */ + if (MANAGER_IS_SYSTEM(m)) + manager_start_target(m, SPECIAL_SIGPWR_TARGET, JOB_REPLACE); + + break; + + case SIGUSR1: + if (manager_dbus_is_running(m, false)) { + log_info("Trying to reconnect to bus..."); + + (void) bus_init_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_init_system(m); + } else { + log_info("Starting D-Bus service..."); + manager_start_target(m, SPECIAL_DBUS_SERVICE, JOB_REPLACE); + } + + break; + + case SIGUSR2: { + _cleanup_free_ char *dump = NULL; + + r = manager_get_dump_string(m, &dump); + if (r < 0) { + log_warning_errno(errno, "Failed to acquire manager dump: %m"); + break; + } + + log_dump(LOG_INFO, dump); + break; + } + + case SIGHUP: + if (verify_run_space_and_log("Refusing to reload") < 0) + break; + + m->objective = MANAGER_RELOAD; + break; + + default: { + + /* Starting SIGRTMIN+0 */ + static const struct { + const char *target; + JobMode mode; + } target_table[] = { + [0] = { SPECIAL_DEFAULT_TARGET, JOB_ISOLATE }, + [1] = { SPECIAL_RESCUE_TARGET, JOB_ISOLATE }, + [2] = { SPECIAL_EMERGENCY_TARGET, JOB_ISOLATE }, + [3] = { SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [4] = { SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [5] = { SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [6] = { SPECIAL_KEXEC_TARGET, JOB_REPLACE_IRREVERSIBLY }, + }; + + /* Starting SIGRTMIN+13, so that target halt and system halt are 10 apart */ + static const ManagerObjective objective_table[] = { + [0] = MANAGER_HALT, + [1] = MANAGER_POWEROFF, + [2] = MANAGER_REBOOT, + [3] = MANAGER_KEXEC, + }; + + if ((int) sfsi.ssi_signo >= SIGRTMIN+0 && + (int) sfsi.ssi_signo < SIGRTMIN+(int) ELEMENTSOF(target_table)) { + int idx = (int) sfsi.ssi_signo - SIGRTMIN; + manager_start_target(m, target_table[idx].target, + target_table[idx].mode); + break; + } + + if ((int) sfsi.ssi_signo >= SIGRTMIN+13 && + (int) sfsi.ssi_signo < SIGRTMIN+13+(int) ELEMENTSOF(objective_table)) { + m->objective = objective_table[sfsi.ssi_signo - SIGRTMIN - 13]; + break; + } + + switch (sfsi.ssi_signo - SIGRTMIN) { + + case 20: + manager_set_show_status(m, SHOW_STATUS_YES); + break; + + case 21: + manager_set_show_status(m, SHOW_STATUS_NO); + break; + + case 22: + manager_override_log_level(m, LOG_DEBUG); + break; + + case 23: + manager_restore_original_log_level(m); + break; + + case 24: + if (MANAGER_IS_USER(m)) { + m->objective = MANAGER_EXIT; + return 0; + } + + /* This is a nop on init */ + break; + + case 26: + case 29: /* compatibility: used to be mapped to LOG_TARGET_SYSLOG_OR_KMSG */ + manager_restore_original_log_target(m); + break; + + case 27: + manager_override_log_target(m, LOG_TARGET_CONSOLE); + break; + + case 28: + manager_override_log_target(m, LOG_TARGET_KMSG); + break; + + default: + log_warning("Got unhandled signal <%s>.", signal_to_string(sfsi.ssi_signo)); + } + }} + + return 0; +} + +static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + Iterator i; + Unit *u; + + assert(m); + assert(m->time_change_fd == fd); + + log_struct(LOG_DEBUG, + "MESSAGE_ID=" SD_MESSAGE_TIME_CHANGE_STR, + LOG_MESSAGE("Time has been changed")); + + /* Restart the watch */ + (void) manager_setup_time_change(m); + + HASHMAP_FOREACH(u, m->units, i) + if (UNIT_VTABLE(u)->time_change) + UNIT_VTABLE(u)->time_change(u); + + return 0; +} + +static int manager_dispatch_timezone_change( + sd_event_source *source, + const struct inotify_event *e, + void *userdata) { + + Manager *m = userdata; + int changed; + Iterator i; + Unit *u; + + assert(m); + + log_debug("inotify event for /etc/localtime"); + + changed = manager_read_timezone_stat(m); + if (changed <= 0) + return changed; + + /* Something changed, restart the watch, to ensure we watch the new /etc/localtime if it changed */ + (void) manager_setup_timezone_change(m); + + /* Read the new timezone */ + tzset(); + + log_debug("Timezone has been changed (now: %s).", tzname[daylight]); + + HASHMAP_FOREACH(u, m->units, i) + if (UNIT_VTABLE(u)->timezone_change) + UNIT_VTABLE(u)->timezone_change(u); + + return 0; +} + +static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + + assert(m); + assert(m->idle_pipe[2] == fd); + + /* There's at least one Type=idle child that just gave up on us waiting for the boot process to complete. Let's + * now turn off any further console output if there's at least one service that needs console access, so that + * from now on our own output should not spill into that service's output anymore. After all, we support + * Type=idle only to beautify console output and it generally is set on services that want to own the console + * exclusively without our interference. */ + m->no_console_output = m->n_on_console > 0; + + /* Acknowledge the child's request, and let all all other children know too that they shouldn't wait any longer + * by closing the pipes towards them, which is what they are waiting for. */ + manager_close_idle_pipe(m); + + return 0; +} + +static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata) { + Manager *m = userdata; + int r; + uint64_t next; + + assert(m); + assert(source); + + manager_print_jobs_in_progress(m); + + next = now(CLOCK_MONOTONIC) + JOBS_IN_PROGRESS_PERIOD_USEC; + r = sd_event_source_set_time(source, next); + if (r < 0) + return r; + + return sd_event_source_set_enabled(source, SD_EVENT_ONESHOT); +} + +int manager_loop(Manager *m) { + int r; + + RATELIMIT_DEFINE(rl, 1*USEC_PER_SEC, 50000); + + assert(m); + assert(m->objective == MANAGER_OK); /* Ensure manager_startup() has been called */ + + /* Release the path cache */ + m->unit_path_cache = set_free_free(m->unit_path_cache); + + manager_check_finished(m); + + /* There might still be some zombies hanging around from before we were exec()'ed. Let's reap them. */ + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON); + if (r < 0) + return log_error_errno(r, "Failed to enable SIGCHLD event source: %m"); + + while (m->objective == MANAGER_OK) { + usec_t wait_usec; + + if (m->runtime_watchdog > 0 && m->runtime_watchdog != USEC_INFINITY && MANAGER_IS_SYSTEM(m)) + watchdog_ping(); + + if (!ratelimit_below(&rl)) { + /* Yay, something is going seriously wrong, pause a little */ + log_warning("Looping too fast. Throttling execution a little."); + sleep(1); + } + + if (manager_dispatch_load_queue(m) > 0) + continue; + + if (manager_dispatch_gc_job_queue(m) > 0) + continue; + + if (manager_dispatch_gc_unit_queue(m) > 0) + continue; + + if (manager_dispatch_cleanup_queue(m) > 0) + continue; + + if (manager_dispatch_cgroup_realize_queue(m) > 0) + continue; + + if (manager_dispatch_stop_when_unneeded_queue(m) > 0) + continue; + + if (manager_dispatch_dbus_queue(m) > 0) + continue; + + /* Sleep for half the watchdog time */ + if (m->runtime_watchdog > 0 && m->runtime_watchdog != USEC_INFINITY && MANAGER_IS_SYSTEM(m)) { + wait_usec = m->runtime_watchdog / 2; + if (wait_usec <= 0) + wait_usec = 1; + } else + wait_usec = USEC_INFINITY; + + r = sd_event_run(m->event, wait_usec); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + } + + return m->objective; +} + +int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u) { + _cleanup_free_ char *n = NULL; + sd_id128_t invocation_id; + Unit *u; + int r; + + assert(m); + assert(s); + assert(_u); + + r = unit_name_from_dbus_path(s, &n); + if (r < 0) + return r; + + /* Permit addressing units by invocation ID: if the passed bus path is suffixed by a 128bit ID then we use it + * as invocation ID. */ + r = sd_id128_from_string(n, &invocation_id); + if (r >= 0) { + u = hashmap_get(m->units_by_invocation_id, &invocation_id); + if (u) { + *_u = u; + return 0; + } + + return sd_bus_error_setf(e, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, + "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", + SD_ID128_FORMAT_VAL(invocation_id)); + } + + /* If this didn't work, we check if this is a unit name */ + if (!unit_name_is_valid(n, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) { + _cleanup_free_ char *nn = NULL; + + nn = cescape(n); + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, + "Unit name %s is neither a valid invocation ID nor unit name.", strnull(nn)); + } + + r = manager_load_unit(m, n, NULL, e, &u); + if (r < 0) + return r; + + *_u = u; + return 0; +} + +int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j) { + const char *p; + unsigned id; + Job *j; + int r; + + assert(m); + assert(s); + assert(_j); + + p = startswith(s, "/org/freedesktop/systemd1/job/"); + if (!p) + return -EINVAL; + + r = safe_atou(p, &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return -ENOENT; + + *_j = j; + + return 0; +} + +void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success) { + +#if HAVE_AUDIT + _cleanup_free_ char *p = NULL; + const char *msg; + int audit_fd, r; + + if (!MANAGER_IS_SYSTEM(m)) + return; + + audit_fd = get_audit_fd(); + if (audit_fd < 0) + return; + + /* Don't generate audit events if the service was already + * started and we're just deserializing */ + if (MANAGER_IS_RELOADING(m)) + return; + + if (u->type != UNIT_SERVICE) + return; + + r = unit_name_to_prefix_and_instance(u->id, &p); + if (r < 0) { + log_error_errno(r, "Failed to extract prefix and instance of unit name: %m"); + return; + } + + msg = strjoina("unit=", p); + if (audit_log_user_comm_message(audit_fd, type, msg, "systemd", NULL, NULL, NULL, success) < 0) { + if (errno == EPERM) + /* We aren't allowed to send audit messages? + * Then let's not retry again. */ + close_audit_fd(); + else + log_warning_errno(errno, "Failed to send audit message: %m"); + } +#endif + +} + +void manager_send_unit_plymouth(Manager *m, Unit *u) { + static const union sockaddr_union sa = PLYMOUTH_SOCKET; + _cleanup_free_ char *message = NULL; + _cleanup_close_ int fd = -1; + int n = 0; + + /* Don't generate plymouth events if the service was already + * started and we're just deserializing */ + if (MANAGER_IS_RELOADING(m)) + return; + + if (!MANAGER_IS_SYSTEM(m)) + return; + + if (detect_container() > 0) + return; + + if (!IN_SET(u->type, UNIT_SERVICE, UNIT_MOUNT, UNIT_SWAP)) + return; + + /* We set SOCK_NONBLOCK here so that we rather drop the + * message then wait for plymouth */ + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) { + log_error_errno(errno, "socket() failed: %m"); + return; + } + + if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) { + if (!IN_SET(errno, EPIPE, EAGAIN, ENOENT, ECONNREFUSED, ECONNRESET, ECONNABORTED)) + log_error_errno(errno, "connect() failed: %m"); + return; + } + + if (asprintf(&message, "U\002%c%s%n", (int) (strlen(u->id) + 1), u->id, &n) < 0) { + log_oom(); + return; + } + + errno = 0; + if (write(fd, message, n + 1) != n + 1) + if (!IN_SET(errno, EPIPE, EAGAIN, ENOENT, ECONNREFUSED, ECONNRESET, ECONNABORTED)) + log_error_errno(errno, "Failed to write Plymouth message: %m"); +} + +int manager_open_serialization(Manager *m, FILE **_f) { + int fd; + FILE *f; + + assert(_f); + + fd = open_serialization_fd("systemd-state"); + if (fd < 0) + return fd; + + f = fdopen(fd, "w+"); + if (!f) { + safe_close(fd); + return -errno; + } + + *_f = f; + return 0; +} + +static bool manager_timestamp_shall_serialize(ManagerTimestamp t) { + + if (!in_initrd()) + return true; + + /* The following timestamps only apply to the host system, hence only serialize them there */ + return !IN_SET(t, + MANAGER_TIMESTAMP_USERSPACE, MANAGER_TIMESTAMP_FINISH, + MANAGER_TIMESTAMP_SECURITY_START, MANAGER_TIMESTAMP_SECURITY_FINISH, + MANAGER_TIMESTAMP_GENERATORS_START, MANAGER_TIMESTAMP_GENERATORS_FINISH, + MANAGER_TIMESTAMP_UNITS_LOAD_START, MANAGER_TIMESTAMP_UNITS_LOAD_FINISH); +} + +int manager_serialize( + Manager *m, + FILE *f, + FDSet *fds, + bool switching_root) { + + ManagerTimestamp q; + const char *t; + Iterator i; + Unit *u; + int r; + + assert(m); + assert(f); + assert(fds); + + _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m); + + (void) serialize_item_format(f, "current-job-id", "%" PRIu32, m->current_job_id); + (void) serialize_item_format(f, "n-installed-jobs", "%u", m->n_installed_jobs); + (void) serialize_item_format(f, "n-failed-jobs", "%u", m->n_failed_jobs); + (void) serialize_bool(f, "taint-usr", m->taint_usr); + (void) serialize_bool(f, "ready-sent", m->ready_sent); + (void) serialize_bool(f, "taint-logged", m->taint_logged); + (void) serialize_bool(f, "service-watchdogs", m->service_watchdogs); + + t = show_status_to_string(m->show_status); + if (t) + (void) serialize_item(f, "show-status", t); + + if (m->log_level_overridden) + (void) serialize_item_format(f, "log-level-override", "%i", log_get_max_level()); + if (m->log_target_overridden) + (void) serialize_item(f, "log-target-override", log_target_to_string(log_get_target())); + + for (q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) { + _cleanup_free_ char *joined = NULL; + + if (!manager_timestamp_shall_serialize(q)) + continue; + + joined = strjoin(manager_timestamp_to_string(q), "-timestamp"); + if (!joined) + return log_oom(); + + (void) serialize_dual_timestamp(f, joined, m->timestamps + q); + } + + if (!switching_root) + (void) serialize_strv(f, "env", m->client_environment); + + if (m->notify_fd >= 0) { + r = serialize_fd(f, fds, "notify-fd", m->notify_fd); + if (r < 0) + return r; + + (void) serialize_item(f, "notify-socket", m->notify_socket); + } + + if (m->cgroups_agent_fd >= 0) { + r = serialize_fd(f, fds, "cgroups-agent-fd", m->cgroups_agent_fd); + if (r < 0) + return r; + } + + if (m->user_lookup_fds[0] >= 0) { + int copy0, copy1; + + copy0 = fdset_put_dup(fds, m->user_lookup_fds[0]); + if (copy0 < 0) + return log_error_errno(copy0, "Failed to add user lookup fd to serialization: %m"); + + copy1 = fdset_put_dup(fds, m->user_lookup_fds[1]); + if (copy1 < 0) + return log_error_errno(copy1, "Failed to add user lookup fd to serialization: %m"); + + (void) serialize_item_format(f, "user-lookup", "%i %i", copy0, copy1); + } + + bus_track_serialize(m->subscribed, f, "subscribed"); + + r = dynamic_user_serialize(m, f, fds); + if (r < 0) + return r; + + manager_serialize_uid_refs(m, f); + manager_serialize_gid_refs(m, f); + + r = exec_runtime_serialize(m, f, fds); + if (r < 0) + return r; + + (void) fputc('\n', f); + + HASHMAP_FOREACH_KEY(u, t, m->units, i) { + if (u->id != t) + continue; + + /* Start marker */ + fputs(u->id, f); + fputc('\n', f); + + r = unit_serialize(u, f, fds, !switching_root); + if (r < 0) + return r; + } + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to flush serialization: %m"); + + r = bus_fdset_add_all(m, fds); + if (r < 0) + return log_error_errno(r, "Failed to add bus sockets to serialization: %m"); + + return 0; +} + +static int manager_deserialize_one_unit(Manager *m, const char *name, FILE *f, FDSet *fds) { + Unit *u; + int r; + + r = manager_load_unit(m, name, NULL, NULL, &u); + if (r < 0) { + if (r == -ENOMEM) + return r; + return log_notice_errno(r, "Failed to load unit \"%s\", skipping deserialization: %m", name); + } + + r = unit_deserialize(u, f, fds); + if (r < 0) { + if (r == -ENOMEM) + return r; + return log_notice_errno(r, "Failed to deserialize unit \"%s\", skipping: %m", name); + } + + return 0; +} + +static int manager_deserialize_units(Manager *m, FILE *f, FDSet *fds) { + const char *unit_name; + int r; + + for (;;) { + _cleanup_free_ char *line = NULL; + /* Start marker */ + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + break; + + unit_name = strstrip(line); + + r = manager_deserialize_one_unit(m, unit_name, f, fds); + if (r == -ENOMEM) + return r; + if (r < 0) { + r = unit_deserialize_skip(f); + if (r < 0) + return r; + } + } + + return 0; +} + +int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { + int r = 0; + + assert(m); + assert(f); + + log_debug("Deserializing state..."); + + /* If we are not in reload mode yet, enter it now. Not that this is recursive, a caller might already have + * increased it to non-zero, which is why we just increase it by one here and down again at the end of this + * call. */ + _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m); + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *val, *l; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + break; + + l = strstrip(line); + if (isempty(l)) /* end marker */ + break; + + if ((val = startswith(l, "current-job-id="))) { + uint32_t id; + + if (safe_atou32(val, &id) < 0) + log_notice("Failed to parse current job id value '%s', ignoring.", val); + else + m->current_job_id = MAX(m->current_job_id, id); + + } else if ((val = startswith(l, "n-installed-jobs="))) { + uint32_t n; + + if (safe_atou32(val, &n) < 0) + log_notice("Failed to parse installed jobs counter '%s', ignoring.", val); + else + m->n_installed_jobs += n; + + } else if ((val = startswith(l, "n-failed-jobs="))) { + uint32_t n; + + if (safe_atou32(val, &n) < 0) + log_notice("Failed to parse failed jobs counter '%s', ignoring.", val); + else + m->n_failed_jobs += n; + + } else if ((val = startswith(l, "taint-usr="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse taint /usr flag '%s', ignoring.", val); + else + m->taint_usr = m->taint_usr || b; + + } else if ((val = startswith(l, "ready-sent="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse ready-sent flag '%s', ignoring.", val); + else + m->ready_sent = m->ready_sent || b; + + } else if ((val = startswith(l, "taint-logged="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse taint-logged flag '%s', ignoring.", val); + else + m->taint_logged = m->taint_logged || b; + + } else if ((val = startswith(l, "service-watchdogs="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse service-watchdogs flag '%s', ignoring.", val); + else + m->service_watchdogs = b; + + } else if ((val = startswith(l, "show-status="))) { + ShowStatus s; + + s = show_status_from_string(val); + if (s < 0) + log_notice("Failed to parse show-status flag '%s', ignoring.", val); + else + manager_set_show_status(m, s); + + } else if ((val = startswith(l, "log-level-override="))) { + int level; + + level = log_level_from_string(val); + if (level < 0) + log_notice("Failed to parse log-level-override value '%s', ignoring.", val); + else + manager_override_log_level(m, level); + + } else if ((val = startswith(l, "log-target-override="))) { + LogTarget target; + + target = log_target_from_string(val); + if (target < 0) + log_notice("Failed to parse log-target-override value '%s', ignoring.", val); + else + manager_override_log_target(m, target); + + } else if (startswith(l, "env=")) { + r = deserialize_environment(l + 4, &m->client_environment); + if (r < 0) + log_notice_errno(r, "Failed to parse environment entry: \"%s\", ignoring: %m", l); + + } else if ((val = startswith(l, "notify-fd="))) { + int fd; + + if (safe_atoi(val, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_notice("Failed to parse notify fd, ignoring: \"%s\"", val); + else { + m->notify_event_source = sd_event_source_unref(m->notify_event_source); + safe_close(m->notify_fd); + m->notify_fd = fdset_remove(fds, fd); + } + + } else if ((val = startswith(l, "notify-socket="))) { + r = free_and_strdup(&m->notify_socket, val); + if (r < 0) + return r; + + } else if ((val = startswith(l, "cgroups-agent-fd="))) { + int fd; + + if (safe_atoi(val, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_notice("Failed to parse cgroups agent fd, ignoring.: %s", val); + else { + m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); + safe_close(m->cgroups_agent_fd); + m->cgroups_agent_fd = fdset_remove(fds, fd); + } + + } else if ((val = startswith(l, "user-lookup="))) { + int fd0, fd1; + + if (sscanf(val, "%i %i", &fd0, &fd1) != 2 || fd0 < 0 || fd1 < 0 || fd0 == fd1 || !fdset_contains(fds, fd0) || !fdset_contains(fds, fd1)) + log_notice("Failed to parse user lookup fd, ignoring: %s", val); + else { + m->user_lookup_event_source = sd_event_source_unref(m->user_lookup_event_source); + safe_close_pair(m->user_lookup_fds); + m->user_lookup_fds[0] = fdset_remove(fds, fd0); + m->user_lookup_fds[1] = fdset_remove(fds, fd1); + } + + } else if ((val = startswith(l, "dynamic-user="))) + dynamic_user_deserialize_one(m, val, fds); + else if ((val = startswith(l, "destroy-ipc-uid="))) + manager_deserialize_uid_refs_one(m, val); + else if ((val = startswith(l, "destroy-ipc-gid="))) + manager_deserialize_gid_refs_one(m, val); + else if ((val = startswith(l, "exec-runtime="))) + exec_runtime_deserialize_one(m, val, fds); + else if ((val = startswith(l, "subscribed="))) { + + if (strv_extend(&m->deserialized_subscribed, val) < 0) + return -ENOMEM; + + } else { + ManagerTimestamp q; + + for (q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) { + val = startswith(l, manager_timestamp_to_string(q)); + if (!val) + continue; + + val = startswith(val, "-timestamp="); + if (val) + break; + } + + if (q < _MANAGER_TIMESTAMP_MAX) /* found it */ + (void) deserialize_dual_timestamp(val, m->timestamps + q); + else if (!startswith(l, "kdbus-fd=")) /* ignore kdbus */ + log_notice("Unknown serialization item '%s', ignoring.", l); + } + } + + return manager_deserialize_units(m, f, fds); +} + +int manager_reload(Manager *m) { + _cleanup_(manager_reloading_stopp) Manager *reloading = NULL; + _cleanup_fdset_free_ FDSet *fds = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(m); + + r = manager_open_serialization(m, &f); + if (r < 0) + return log_error_errno(r, "Failed to create serialization file: %m"); + + fds = fdset_new(); + if (!fds) + return log_oom(); + + /* We are officially in reload mode from here on. */ + reloading = manager_reloading_start(m); + + r = manager_serialize(m, f, fds, false); + if (r < 0) + return r; + + if (fseeko(f, 0, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek to beginning of serialization: %m"); + + /* 💀 This is the point of no return, from here on there is no way back. 💀 */ + reloading = NULL; + + bus_manager_send_reloading(m, true); + + /* Start by flushing out all jobs and units, all generated units, all runtime environments, all dynamic users + * and everything else that is worth flushing out. We'll get it all back from the serialization — if we need + * it.*/ + + manager_clear_jobs_and_units(m); + lookup_paths_flush_generator(&m->lookup_paths); + lookup_paths_free(&m->lookup_paths); + exec_runtime_vacuum(m); + dynamic_user_vacuum(m, false); + m->uid_refs = hashmap_free(m->uid_refs); + m->gid_refs = hashmap_free(m->gid_refs); + + r = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, 0, NULL); + if (r < 0) + log_warning_errno(r, "Failed to initialize path lookup table, ignoring: %m"); + + (void) manager_run_environment_generators(m); + (void) manager_run_generators(m); + + r = lookup_paths_reduce(&m->lookup_paths); + if (r < 0) + log_warning_errno(r, "Failed ot reduce unit file paths, ignoring: %m"); + + manager_build_unit_path_cache(m); + + /* First, enumerate what we can from kernel and suchlike */ + manager_enumerate_perpetual(m); + manager_enumerate(m); + + /* Second, deserialize our stored data */ + r = manager_deserialize(m, f, fds); + if (r < 0) + log_warning_errno(r, "Deserialization failed, proceeding anyway: %m"); + + /* We don't need the serialization anymore */ + f = safe_fclose(f); + + /* Re-register notify_fd as event source, and set up other sockets/communication channels we might need */ + (void) manager_setup_notify(m); + (void) manager_setup_cgroups_agent(m); + (void) manager_setup_user_lookup_fd(m); + + /* Third, fire things up! */ + manager_coldplug(m); + + /* Clean up runtime objects no longer referenced */ + manager_vacuum(m); + + /* Consider the reload process complete now. */ + assert(m->n_reloading > 0); + m->n_reloading--; + + manager_ready(m); + + m->send_reloading_done = true; + return 0; +} + +void manager_reset_failed(Manager *m) { + Unit *u; + Iterator i; + + assert(m); + + HASHMAP_FOREACH(u, m->units, i) + unit_reset_failed(u); +} + +bool manager_unit_inactive_or_pending(Manager *m, const char *name) { + Unit *u; + + assert(m); + assert(name); + + /* Returns true if the unit is inactive or going down */ + u = manager_get_unit(m, name); + if (!u) + return true; + + return unit_inactive_or_pending(u); +} + +static void log_taint_string(Manager *m) { + _cleanup_free_ char *taint = NULL; + + assert(m); + + if (MANAGER_IS_USER(m) || m->taint_logged) + return; + + m->taint_logged = true; /* only check for taint once */ + + taint = manager_taint_string(m); + if (isempty(taint)) + return; + + log_struct(LOG_NOTICE, + LOG_MESSAGE("System is tainted: %s", taint), + "TAINT=%s", taint, + "MESSAGE_ID=" SD_MESSAGE_TAINTED_STR); +} + +static void manager_notify_finished(Manager *m) { + char userspace[FORMAT_TIMESPAN_MAX], initrd[FORMAT_TIMESPAN_MAX], kernel[FORMAT_TIMESPAN_MAX], sum[FORMAT_TIMESPAN_MAX]; + usec_t firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec; + + if (MANAGER_IS_TEST_RUN(m)) + return; + + if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) { + char ts[FORMAT_TIMESPAN_MAX]; + char buf[FORMAT_TIMESPAN_MAX + STRLEN(" (firmware) + ") + FORMAT_TIMESPAN_MAX + STRLEN(" (loader) + ")] + = {}; + char *p = buf; + size_t size = sizeof buf; + + /* Note that MANAGER_TIMESTAMP_KERNEL's monotonic value is always at 0, and + * MANAGER_TIMESTAMP_FIRMWARE's and MANAGER_TIMESTAMP_LOADER's monotonic value should be considered + * negative values. */ + + firmware_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic - m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic; + loader_usec = m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic; + userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic; + total_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic + m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic; + + if (firmware_usec > 0) + size = strpcpyf(&p, size, "%s (firmware) + ", format_timespan(ts, sizeof(ts), firmware_usec, USEC_PER_MSEC)); + if (loader_usec > 0) + size = strpcpyf(&p, size, "%s (loader) + ", format_timespan(ts, sizeof(ts), loader_usec, USEC_PER_MSEC)); + + if (dual_timestamp_is_set(&m->timestamps[MANAGER_TIMESTAMP_INITRD])) { + + /* The initrd case on bare-metal*/ + kernel_usec = m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic; + initrd_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR, + "KERNEL_USEC="USEC_FMT, kernel_usec, + "INITRD_USEC="USEC_FMT, initrd_usec, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (initrd) + %s (userspace) = %s.", + buf, + format_timespan(kernel, sizeof(kernel), kernel_usec, USEC_PER_MSEC), + format_timespan(initrd, sizeof(initrd), initrd_usec, USEC_PER_MSEC), + format_timespan(userspace, sizeof(userspace), userspace_usec, USEC_PER_MSEC), + format_timespan(sum, sizeof(sum), total_usec, USEC_PER_MSEC))); + } else { + /* The initrd-less case on bare-metal*/ + + kernel_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic; + initrd_usec = 0; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR, + "KERNEL_USEC="USEC_FMT, kernel_usec, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (userspace) = %s.", + buf, + format_timespan(kernel, sizeof(kernel), kernel_usec, USEC_PER_MSEC), + format_timespan(userspace, sizeof(userspace), userspace_usec, USEC_PER_MSEC), + format_timespan(sum, sizeof(sum), total_usec, USEC_PER_MSEC))); + } + } else { + /* The container and --user case */ + firmware_usec = loader_usec = initrd_usec = kernel_usec = 0; + total_usec = userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_USER_STARTUP_FINISHED_STR, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Startup finished in %s.", + format_timespan(sum, sizeof(sum), total_usec, USEC_PER_MSEC))); + } + + bus_manager_send_finished(m, firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec); + + sd_notifyf(false, + m->ready_sent ? "STATUS=Startup finished in %s." + : "READY=1\n" + "STATUS=Startup finished in %s.", + format_timespan(sum, sizeof(sum), total_usec, USEC_PER_MSEC)); + m->ready_sent = true; + + log_taint_string(m); +} + +static void manager_send_ready(Manager *m) { + assert(m); + + /* We send READY=1 on reaching basic.target only when running in --user mode. */ + if (!MANAGER_IS_USER(m) || m->ready_sent) + return; + + m->ready_sent = true; + + sd_notifyf(false, + "READY=1\n" + "STATUS=Reached " SPECIAL_BASIC_TARGET "."); +} + +static void manager_check_basic_target(Manager *m) { + Unit *u; + + assert(m); + + /* Small shortcut */ + if (m->ready_sent && m->taint_logged) + return; + + u = manager_get_unit(m, SPECIAL_BASIC_TARGET); + if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return; + + /* For user managers, send out READY=1 as soon as we reach basic.target */ + manager_send_ready(m); + + /* Log the taint string as soon as we reach basic.target */ + log_taint_string(m); +} + +void manager_check_finished(Manager *m) { + assert(m); + + if (MANAGER_IS_RELOADING(m)) + return; + + /* Verify that we have entered the event loop already, and not left it again. */ + if (!MANAGER_IS_RUNNING(m)) + return; + + manager_check_basic_target(m); + + if (hashmap_size(m->jobs) > 0) { + if (m->jobs_in_progress_event_source) + /* Ignore any failure, this is only for feedback */ + (void) sd_event_source_set_time(m->jobs_in_progress_event_source, now(CLOCK_MONOTONIC) + JOBS_IN_PROGRESS_WAIT_USEC); + + return; + } + + manager_flip_auto_status(m, false); + + /* Notify Type=idle units that we are done now */ + manager_close_idle_pipe(m); + + /* Turn off confirm spawn now */ + m->confirm_spawn = NULL; + + /* No need to update ask password status when we're going non-interactive */ + manager_close_ask_password(m); + + /* This is no longer the first boot */ + manager_set_first_boot(m, false); + + if (MANAGER_IS_FINISHED(m)) + return; + + dual_timestamp_get(m->timestamps + MANAGER_TIMESTAMP_FINISH); + + manager_notify_finished(m); + + manager_invalidate_startup_units(m); +} + +static bool generator_path_any(const char* const* paths) { + char **path; + bool found = false; + + /* Optimize by skipping the whole process by not creating output directories + * if no generators are found. */ + STRV_FOREACH(path, (char**) paths) + if (access(*path, F_OK) == 0) + found = true; + else if (errno != ENOENT) + log_warning_errno(errno, "Failed to open generator directory %s: %m", *path); + + return found; +} + +static const char* system_env_generator_binary_paths[] = { + "/run/systemd/system-environment-generators", + "/etc/systemd/system-environment-generators", + "/usr/local/lib/systemd/system-environment-generators", + SYSTEM_ENV_GENERATOR_PATH, + NULL +}; + +static const char* user_env_generator_binary_paths[] = { + "/run/systemd/user-environment-generators", + "/etc/systemd/user-environment-generators", + "/usr/local/lib/systemd/user-environment-generators", + USER_ENV_GENERATOR_PATH, + NULL +}; + +static int manager_run_environment_generators(Manager *m) { + char **tmp = NULL; /* this is only used in the forked process, no cleanup here */ + const char **paths; + void* args[] = { + [STDOUT_GENERATE] = &tmp, + [STDOUT_COLLECT] = &tmp, + [STDOUT_CONSUME] = &m->transient_environment, + }; + int r; + + if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_ENV_GENERATORS)) + return 0; + + paths = MANAGER_IS_SYSTEM(m) ? system_env_generator_binary_paths : user_env_generator_binary_paths; + + if (!generator_path_any(paths)) + return 0; + + RUN_WITH_UMASK(0022) + r = execute_directories(paths, DEFAULT_TIMEOUT_USEC, gather_environment, args, NULL, m->transient_environment); + + return r; +} + +static int manager_run_generators(Manager *m) { + _cleanup_strv_free_ char **paths = NULL; + const char *argv[5]; + int r; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_GENERATORS)) + return 0; + + paths = generator_binary_paths(m->unit_file_scope); + if (!paths) + return log_oom(); + + if (!generator_path_any((const char* const*) paths)) + return 0; + + r = lookup_paths_mkdir_generator(&m->lookup_paths); + if (r < 0) { + log_error_errno(r, "Failed to create generator directories: %m"); + goto finish; + } + + argv[0] = NULL; /* Leave this empty, execute_directory() will fill something in */ + argv[1] = m->lookup_paths.generator; + argv[2] = m->lookup_paths.generator_early; + argv[3] = m->lookup_paths.generator_late; + argv[4] = NULL; + + RUN_WITH_UMASK(0022) + (void) execute_directories((const char* const*) paths, DEFAULT_TIMEOUT_USEC, + NULL, NULL, (char**) argv, m->transient_environment); + + r = 0; + +finish: + lookup_paths_trim_generator(&m->lookup_paths); + return r; +} + +int manager_transient_environment_add(Manager *m, char **plus) { + char **a; + + assert(m); + + if (strv_isempty(plus)) + return 0; + + a = strv_env_merge(2, m->transient_environment, plus); + if (!a) + return log_oom(); + + sanitize_environment(a); + + return strv_free_and_replace(m->transient_environment, a); +} + +int manager_client_environment_modify( + Manager *m, + char **minus, + char **plus) { + + char **a = NULL, **b = NULL, **l; + + assert(m); + + if (strv_isempty(minus) && strv_isempty(plus)) + return 0; + + l = m->client_environment; + + if (!strv_isempty(minus)) { + a = strv_env_delete(l, 1, minus); + if (!a) + return -ENOMEM; + + l = a; + } + + if (!strv_isempty(plus)) { + b = strv_env_merge(2, l, plus); + if (!b) { + strv_free(a); + return -ENOMEM; + } + + l = b; + } + + if (m->client_environment != l) + strv_free(m->client_environment); + + if (a != l) + strv_free(a); + if (b != l) + strv_free(b); + + m->client_environment = sanitize_environment(l); + return 0; +} + +int manager_get_effective_environment(Manager *m, char ***ret) { + char **l; + + assert(m); + assert(ret); + + l = strv_env_merge(2, m->transient_environment, m->client_environment); + if (!l) + return -ENOMEM; + + *ret = l; + return 0; +} + +int manager_set_default_rlimits(Manager *m, struct rlimit **default_rlimit) { + int i; + + assert(m); + + for (i = 0; i < _RLIMIT_MAX; i++) { + m->rlimit[i] = mfree(m->rlimit[i]); + + if (!default_rlimit[i]) + continue; + + m->rlimit[i] = newdup(struct rlimit, default_rlimit[i], 1); + if (!m->rlimit[i]) + return log_oom(); + } + + return 0; +} + +void manager_recheck_dbus(Manager *m) { + assert(m); + + /* Connects to the bus if the dbus service and socket are running. If we are running in user mode this is all + * it does. In system mode we'll also connect to the system bus (which will most likely just reuse the + * connection of the API bus). That's because the system bus after all runs as service of the system instance, + * while in the user instance we can assume it's already there. */ + + if (MANAGER_IS_RELOADING(m)) + return; /* don't check while we are reloading… */ + + if (manager_dbus_is_running(m, false)) { + (void) bus_init_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_init_system(m); + } else { + (void) bus_done_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_done_system(m); + } +} + +static bool manager_journal_is_running(Manager *m) { + Unit *u; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return false; + + /* If we are the user manager we can safely assume that the journal is up */ + if (!MANAGER_IS_SYSTEM(m)) + return true; + + /* Check that the socket is not only up, but in RUNNING state */ + u = manager_get_unit(m, SPECIAL_JOURNALD_SOCKET); + if (!u) + return false; + if (SOCKET(u)->state != SOCKET_RUNNING) + return false; + + /* Similar, check if the daemon itself is fully up, too */ + u = manager_get_unit(m, SPECIAL_JOURNALD_SERVICE); + if (!u) + return false; + if (!IN_SET(SERVICE(u)->state, SERVICE_RELOAD, SERVICE_RUNNING)) + return false; + + return true; +} + +void manager_recheck_journal(Manager *m) { + + assert(m); + + /* Don't bother with this unless we are in the special situation of being PID 1 */ + if (getpid_cached() != 1) + return; + + /* Don't check this while we are reloading, things might still change */ + if (MANAGER_IS_RELOADING(m)) + return; + + /* The journal is fully and entirely up? If so, let's permit logging to it, if that's configured. If the + * journal is down, don't ever log to it, otherwise we might end up deadlocking ourselves as we might trigger + * an activation ourselves we can't fulfill. */ + log_set_prohibit_ipc(!manager_journal_is_running(m)); + log_open(); +} + +void manager_set_show_status(Manager *m, ShowStatus mode) { + assert(m); + assert(IN_SET(mode, SHOW_STATUS_AUTO, SHOW_STATUS_NO, SHOW_STATUS_YES, SHOW_STATUS_TEMPORARY)); + + if (!MANAGER_IS_SYSTEM(m)) + return; + + if (m->show_status != mode) + log_debug("%s showing of status.", + mode == SHOW_STATUS_NO ? "Disabling" : "Enabling"); + m->show_status = mode; + + if (IN_SET(mode, SHOW_STATUS_TEMPORARY, SHOW_STATUS_YES)) + (void) touch("/run/systemd/show-status"); + else + (void) unlink("/run/systemd/show-status"); +} + +static bool manager_get_show_status(Manager *m, StatusType type) { + assert(m); + + if (!MANAGER_IS_SYSTEM(m)) + return false; + + if (m->no_console_output) + return false; + + if (!IN_SET(manager_state(m), MANAGER_INITIALIZING, MANAGER_STARTING, MANAGER_STOPPING)) + return false; + + /* If we cannot find out the status properly, just proceed. */ + if (type != STATUS_TYPE_EMERGENCY && manager_check_ask_password(m) > 0) + return false; + + return IN_SET(m->show_status, SHOW_STATUS_TEMPORARY, SHOW_STATUS_YES); +} + +const char *manager_get_confirm_spawn(Manager *m) { + static int last_errno = 0; + const char *vc = m->confirm_spawn; + struct stat st; + int r; + + /* Here's the deal: we want to test the validity of the console but don't want + * PID1 to go through the whole console process which might block. But we also + * want to warn the user only once if something is wrong with the console so we + * cannot do the sanity checks after spawning our children. So here we simply do + * really basic tests to hopefully trap common errors. + * + * If the console suddenly disappear at the time our children will really it + * then they will simply fail to acquire it and a positive answer will be + * assumed. New children will fallback to /dev/console though. + * + * Note: TTYs are devices that can come and go any time, and frequently aren't + * available yet during early boot (consider a USB rs232 dongle...). If for any + * reason the configured console is not ready, we fallback to the default + * console. */ + + if (!vc || path_equal(vc, "/dev/console")) + return vc; + + r = stat(vc, &st); + if (r < 0) + goto fail; + + if (!S_ISCHR(st.st_mode)) { + errno = ENOTTY; + goto fail; + } + + last_errno = 0; + return vc; +fail: + if (last_errno != errno) { + last_errno = errno; + log_warning_errno(errno, "Failed to open %s: %m, using default console", vc); + } + return "/dev/console"; +} + +void manager_set_first_boot(Manager *m, bool b) { + assert(m); + + if (!MANAGER_IS_SYSTEM(m)) + return; + + if (m->first_boot != (int) b) { + if (b) + (void) touch("/run/systemd/first-boot"); + else + (void) unlink("/run/systemd/first-boot"); + } + + m->first_boot = b; +} + +void manager_disable_confirm_spawn(void) { + (void) touch("/run/systemd/confirm_spawn_disabled"); +} + +bool manager_is_confirm_spawn_disabled(Manager *m) { + if (!m->confirm_spawn) + return true; + + return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0; +} + +void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) { + va_list ap; + + /* If m is NULL, assume we're after shutdown and let the messages through. */ + + if (m && !manager_get_show_status(m, type)) + return; + + /* XXX We should totally drop the check for ephemeral here + * and thus effectively make 'Type=idle' pointless. */ + if (type == STATUS_TYPE_EPHEMERAL && m && m->n_on_console > 0) + return; + + va_start(ap, format); + status_vprintf(status, SHOW_STATUS_ELLIPSIZE|(type == STATUS_TYPE_EPHEMERAL ? SHOW_STATUS_EPHEMERAL : 0), format, ap); + va_end(ap); +} + +Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path) { + char p[strlen(path)+1]; + + assert(m); + assert(path); + + strcpy(p, path); + path_simplify(p, false); + + return hashmap_get(m->units_requiring_mounts_for, streq(p, "/") ? "" : p); +} + +int manager_update_failed_units(Manager *m, Unit *u, bool failed) { + unsigned size; + int r; + + assert(m); + assert(u->manager == m); + + size = set_size(m->failed_units); + + if (failed) { + r = set_ensure_allocated(&m->failed_units, NULL); + if (r < 0) + return log_oom(); + + if (set_put(m->failed_units, u) < 0) + return log_oom(); + } else + (void) set_remove(m->failed_units, u); + + if (set_size(m->failed_units) != size) + bus_manager_send_change_signal(m); + + return 0; +} + +ManagerState manager_state(Manager *m) { + Unit *u; + + assert(m); + + /* Did we ever finish booting? If not then we are still starting up */ + if (!MANAGER_IS_FINISHED(m)) { + + u = manager_get_unit(m, SPECIAL_BASIC_TARGET); + if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return MANAGER_INITIALIZING; + + return MANAGER_STARTING; + } + + /* Is the special shutdown target active or queued? If so, we are in shutdown state */ + u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET); + if (u && unit_active_or_pending(u)) + return MANAGER_STOPPING; + + if (MANAGER_IS_SYSTEM(m)) { + /* Are the rescue or emergency targets active or queued? If so we are in maintenance state */ + u = manager_get_unit(m, SPECIAL_RESCUE_TARGET); + if (u && unit_active_or_pending(u)) + return MANAGER_MAINTENANCE; + + u = manager_get_unit(m, SPECIAL_EMERGENCY_TARGET); + if (u && unit_active_or_pending(u)) + return MANAGER_MAINTENANCE; + } + + /* Are there any failed units? If so, we are in degraded mode */ + if (set_size(m->failed_units) > 0) + return MANAGER_DEGRADED; + + return MANAGER_RUNNING; +} + +#define DESTROY_IPC_FLAG (UINT32_C(1) << 31) + +static void manager_unref_uid_internal( + Manager *m, + Hashmap **uid_refs, + uid_t uid, + bool destroy_now, + int (*_clean_ipc)(uid_t uid)) { + + uint32_t c, n; + + assert(m); + assert(uid_refs); + assert(uid_is_valid(uid)); + assert(_clean_ipc); + + /* A generic implementation, covering both manager_unref_uid() and manager_unref_gid(), under the assumption + * that uid_t and gid_t are actually defined the same way, with the same validity rules. + * + * We store a hashmap where the UID/GID is they key and the value is a 32bit reference counter, whose highest + * bit is used as flag for marking UIDs/GIDs whose IPC objects to remove when the last reference to the UID/GID + * is dropped. The flag is set to on, once at least one reference from a unit where RemoveIPC= is set is added + * on a UID/GID. It is reset when the UID's/GID's reference counter drops to 0 again. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (uid == 0) /* We don't keep track of root, and will never destroy it */ + return; + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + + n = c & ~DESTROY_IPC_FLAG; + assert(n > 0); + n--; + + if (destroy_now && n == 0) { + hashmap_remove(*uid_refs, UID_TO_PTR(uid)); + + if (c & DESTROY_IPC_FLAG) { + log_debug("%s " UID_FMT " is no longer referenced, cleaning up its IPC.", + _clean_ipc == clean_ipc_by_uid ? "UID" : "GID", + uid); + (void) _clean_ipc(uid); + } + } else { + c = n | (c & DESTROY_IPC_FLAG); + assert_se(hashmap_update(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)) >= 0); + } +} + +void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now) { + manager_unref_uid_internal(m, &m->uid_refs, uid, destroy_now, clean_ipc_by_uid); +} + +void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now) { + manager_unref_uid_internal(m, &m->gid_refs, (uid_t) gid, destroy_now, clean_ipc_by_gid); +} + +static int manager_ref_uid_internal( + Manager *m, + Hashmap **uid_refs, + uid_t uid, + bool clean_ipc) { + + uint32_t c, n; + int r; + + assert(m); + assert(uid_refs); + assert(uid_is_valid(uid)); + + /* A generic implementation, covering both manager_ref_uid() and manager_ref_gid(), under the assumption + * that uid_t and gid_t are actually defined the same way, with the same validity rules. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (uid == 0) /* We don't keep track of root, and will never destroy it */ + return 0; + + r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops); + if (r < 0) + return r; + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + + n = c & ~DESTROY_IPC_FLAG; + n++; + + if (n & DESTROY_IPC_FLAG) /* check for overflow */ + return -EOVERFLOW; + + c = n | (c & DESTROY_IPC_FLAG) | (clean_ipc ? DESTROY_IPC_FLAG : 0); + + return hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)); +} + +int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc) { + return manager_ref_uid_internal(m, &m->uid_refs, uid, clean_ipc); +} + +int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc) { + return manager_ref_uid_internal(m, &m->gid_refs, (uid_t) gid, clean_ipc); +} + +static void manager_vacuum_uid_refs_internal( + Manager *m, + Hashmap **uid_refs, + int (*_clean_ipc)(uid_t uid)) { + + Iterator i; + void *p, *k; + + assert(m); + assert(uid_refs); + assert(_clean_ipc); + + HASHMAP_FOREACH_KEY(p, k, *uid_refs, i) { + uint32_t c, n; + uid_t uid; + + uid = PTR_TO_UID(k); + c = PTR_TO_UINT32(p); + + n = c & ~DESTROY_IPC_FLAG; + if (n > 0) + continue; + + if (c & DESTROY_IPC_FLAG) { + log_debug("Found unreferenced %s " UID_FMT " after reload/reexec. Cleaning up.", + _clean_ipc == clean_ipc_by_uid ? "UID" : "GID", + uid); + (void) _clean_ipc(uid); + } + + assert_se(hashmap_remove(*uid_refs, k) == p); + } +} + +void manager_vacuum_uid_refs(Manager *m) { + manager_vacuum_uid_refs_internal(m, &m->uid_refs, clean_ipc_by_uid); +} + +void manager_vacuum_gid_refs(Manager *m) { + manager_vacuum_uid_refs_internal(m, &m->gid_refs, clean_ipc_by_gid); +} + +static void manager_serialize_uid_refs_internal( + Manager *m, + FILE *f, + Hashmap **uid_refs, + const char *field_name) { + + Iterator i; + void *p, *k; + + assert(m); + assert(f); + assert(uid_refs); + assert(field_name); + + /* Serialize the UID reference table. Or actually, just the IPC destruction flag of it, as the actual counter + * of it is better rebuild after a reload/reexec. */ + + HASHMAP_FOREACH_KEY(p, k, *uid_refs, i) { + uint32_t c; + uid_t uid; + + uid = PTR_TO_UID(k); + c = PTR_TO_UINT32(p); + + if (!(c & DESTROY_IPC_FLAG)) + continue; + + (void) serialize_item_format(f, field_name, UID_FMT, uid); + } +} + +void manager_serialize_uid_refs(Manager *m, FILE *f) { + manager_serialize_uid_refs_internal(m, f, &m->uid_refs, "destroy-ipc-uid"); +} + +void manager_serialize_gid_refs(Manager *m, FILE *f) { + manager_serialize_uid_refs_internal(m, f, &m->gid_refs, "destroy-ipc-gid"); +} + +static void manager_deserialize_uid_refs_one_internal( + Manager *m, + Hashmap** uid_refs, + const char *value) { + + uid_t uid; + uint32_t c; + int r; + + assert(m); + assert(uid_refs); + assert(value); + + r = parse_uid(value, &uid); + if (r < 0 || uid == 0) { + log_debug("Unable to parse UID reference serialization"); + return; + } + + r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops); + if (r < 0) { + log_oom(); + return; + } + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + if (c & DESTROY_IPC_FLAG) + return; + + c |= DESTROY_IPC_FLAG; + + r = hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)); + if (r < 0) { + log_debug_errno(r, "Failed to add UID reference entry: %m"); + return; + } +} + +void manager_deserialize_uid_refs_one(Manager *m, const char *value) { + manager_deserialize_uid_refs_one_internal(m, &m->uid_refs, value); +} + +void manager_deserialize_gid_refs_one(Manager *m, const char *value) { + manager_deserialize_uid_refs_one_internal(m, &m->gid_refs, value); +} + +int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + struct buffer { + uid_t uid; + gid_t gid; + char unit_name[UNIT_NAME_MAX+1]; + } _packed_ buffer; + + Manager *m = userdata; + ssize_t l; + size_t n; + Unit *u; + + assert_se(source); + assert_se(m); + + /* Invoked whenever a child process succeeded resolving its user/group to use and sent us the resulting UID/GID + * in a datagram. We parse the datagram here and pass it off to the unit, so that it can add a reference to the + * UID/GID so that it can destroy the UID/GID's IPC objects when the reference counter drops to 0. */ + + l = recv(fd, &buffer, sizeof(buffer), MSG_DONTWAIT); + if (l < 0) { + if (IN_SET(errno, EINTR, EAGAIN)) + return 0; + + return log_error_errno(errno, "Failed to read from user lookup fd: %m"); + } + + if ((size_t) l <= offsetof(struct buffer, unit_name)) { + log_warning("Received too short user lookup message, ignoring."); + return 0; + } + + if ((size_t) l > offsetof(struct buffer, unit_name) + UNIT_NAME_MAX) { + log_warning("Received too long user lookup message, ignoring."); + return 0; + } + + if (!uid_is_valid(buffer.uid) && !gid_is_valid(buffer.gid)) { + log_warning("Got user lookup message with invalid UID/GID pair, ignoring."); + return 0; + } + + n = (size_t) l - offsetof(struct buffer, unit_name); + if (memchr(buffer.unit_name, 0, n)) { + log_warning("Received lookup message with embedded NUL character, ignoring."); + return 0; + } + + buffer.unit_name[n] = 0; + u = manager_get_unit(m, buffer.unit_name); + if (!u) { + log_debug("Got user lookup message but unit doesn't exist, ignoring."); + return 0; + } + + log_unit_debug(u, "User lookup succeeded: uid=" UID_FMT " gid=" GID_FMT, buffer.uid, buffer.gid); + + unit_notify_user_lookup(u, buffer.uid, buffer.gid); + return 0; +} + +char *manager_taint_string(Manager *m) { + _cleanup_free_ char *destination = NULL, *overflowuid = NULL, *overflowgid = NULL; + char *buf, *e; + int r; + + /* Returns a "taint string", e.g. "local-hwclock:var-run-bad". + * Only things that are detected at runtime should be tagged + * here. For stuff that is set during compilation, emit a warning + * in the configuration phase. */ + + assert(m); + + buf = new(char, sizeof("split-usr:" + "cgroups-missing:" + "local-hwclock:" + "var-run-bad:" + "overflowuid-not-65534:" + "overflowgid-not-65534:")); + if (!buf) + return NULL; + + e = buf; + buf[0] = 0; + + if (m->taint_usr) + e = stpcpy(e, "split-usr:"); + + if (access("/proc/cgroups", F_OK) < 0) + e = stpcpy(e, "cgroups-missing:"); + + if (clock_is_localtime(NULL) > 0) + e = stpcpy(e, "local-hwclock:"); + + r = readlink_malloc("/var/run", &destination); + if (r < 0 || !PATH_IN_SET(destination, "../run", "/run")) + e = stpcpy(e, "var-run-bad:"); + + r = read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid); + if (r >= 0 && !streq(overflowuid, "65534")) + e = stpcpy(e, "overflowuid-not-65534:"); + + r = read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid); + if (r >= 0 && !streq(overflowgid, "65534")) + e = stpcpy(e, "overflowgid-not-65534:"); + + /* remove the last ':' */ + if (e != buf) + e[-1] = 0; + + return buf; +} + +void manager_ref_console(Manager *m) { + assert(m); + + m->n_on_console++; +} + +void manager_unref_console(Manager *m) { + + assert(m->n_on_console > 0); + m->n_on_console--; + + if (m->n_on_console == 0) + m->no_console_output = false; /* unset no_console_output flag, since the console is definitely free now */ +} + +void manager_override_log_level(Manager *m, int level) { + _cleanup_free_ char *s = NULL; + assert(m); + + if (!m->log_level_overridden) { + m->original_log_level = log_get_max_level(); + m->log_level_overridden = true; + } + + (void) log_level_to_string_alloc(level, &s); + log_info("Setting log level to %s.", strna(s)); + + log_set_max_level(level); +} + +void manager_restore_original_log_level(Manager *m) { + _cleanup_free_ char *s = NULL; + assert(m); + + if (!m->log_level_overridden) + return; + + (void) log_level_to_string_alloc(m->original_log_level, &s); + log_info("Restoring log level to original (%s).", strna(s)); + + log_set_max_level(m->original_log_level); + m->log_level_overridden = false; +} + +void manager_override_log_target(Manager *m, LogTarget target) { + assert(m); + + if (!m->log_target_overridden) { + m->original_log_target = log_get_target(); + m->log_target_overridden = true; + } + + log_info("Setting log target to %s.", log_target_to_string(target)); + log_set_target(target); +} + +void manager_restore_original_log_target(Manager *m) { + assert(m); + + if (!m->log_target_overridden) + return; + + log_info("Restoring log target to original %s.", log_target_to_string(m->original_log_target)); + + log_set_target(m->original_log_target); + m->log_target_overridden = false; +} + +ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s) { + if (in_initrd() && + s >= MANAGER_TIMESTAMP_SECURITY_START && + s <= MANAGER_TIMESTAMP_UNITS_LOAD_FINISH) + return s - MANAGER_TIMESTAMP_SECURITY_START + MANAGER_TIMESTAMP_INITRD_SECURITY_START; + return s; +} + +static const char *const manager_state_table[_MANAGER_STATE_MAX] = { + [MANAGER_INITIALIZING] = "initializing", + [MANAGER_STARTING] = "starting", + [MANAGER_RUNNING] = "running", + [MANAGER_DEGRADED] = "degraded", + [MANAGER_MAINTENANCE] = "maintenance", + [MANAGER_STOPPING] = "stopping", +}; + +DEFINE_STRING_TABLE_LOOKUP(manager_state, ManagerState); + +static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = { + [MANAGER_TIMESTAMP_FIRMWARE] = "firmware", + [MANAGER_TIMESTAMP_LOADER] = "loader", + [MANAGER_TIMESTAMP_KERNEL] = "kernel", + [MANAGER_TIMESTAMP_INITRD] = "initrd", + [MANAGER_TIMESTAMP_USERSPACE] = "userspace", + [MANAGER_TIMESTAMP_FINISH] = "finish", + [MANAGER_TIMESTAMP_SECURITY_START] = "security-start", + [MANAGER_TIMESTAMP_SECURITY_FINISH] = "security-finish", + [MANAGER_TIMESTAMP_GENERATORS_START] = "generators-start", + [MANAGER_TIMESTAMP_GENERATORS_FINISH] = "generators-finish", + [MANAGER_TIMESTAMP_UNITS_LOAD_START] = "units-load-start", + [MANAGER_TIMESTAMP_UNITS_LOAD_FINISH] = "units-load-finish", + [MANAGER_TIMESTAMP_INITRD_SECURITY_START] = "initrd-security-start", + [MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH] = "initrd-security-finish", + [MANAGER_TIMESTAMP_INITRD_GENERATORS_START] = "initrd-generators-start", + [MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH] = "initrd-generators-finish", + [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START] = "initrd-units-load-start", + [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH] = "initrd-units-load-finish", +}; + +DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp); diff --git a/src/core/manager.h b/src/core/manager.h new file mode 100644 index 0000000..bce8020 --- /dev/null +++ b/src/core/manager.h @@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <stdbool.h> +#include <stdio.h> + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-event.h" + +#include "cgroup-util.h" +#include "fdset.h" +#include "hashmap.h" +#include "ip-address-access.h" +#include "list.h" +#include "ratelimit.h" + +struct libmnt_monitor; +typedef struct Unit Unit; + +/* Enforce upper limit how many names we allow */ +#define MANAGER_MAX_NAMES 131072 /* 128K */ + +typedef struct Manager Manager; + +/* An externally visible state. We don't actually maintain this as state variable, but derive it from various fields + * when requested */ +typedef enum ManagerState { + MANAGER_INITIALIZING, + MANAGER_STARTING, + MANAGER_RUNNING, + MANAGER_DEGRADED, + MANAGER_MAINTENANCE, + MANAGER_STOPPING, + _MANAGER_STATE_MAX, + _MANAGER_STATE_INVALID = -1 +} ManagerState; + +typedef enum ManagerObjective { + MANAGER_OK, + MANAGER_EXIT, + MANAGER_RELOAD, + MANAGER_REEXECUTE, + MANAGER_REBOOT, + MANAGER_POWEROFF, + MANAGER_HALT, + MANAGER_KEXEC, + MANAGER_SWITCH_ROOT, + _MANAGER_OBJECTIVE_MAX, + _MANAGER_OBJECTIVE_INVALID = -1 +} ManagerObjective; + +typedef enum StatusType { + STATUS_TYPE_EPHEMERAL, + STATUS_TYPE_NORMAL, + STATUS_TYPE_EMERGENCY, +} StatusType; + +/* Notes: + * 1. TIMESTAMP_FIRMWARE, TIMESTAMP_LOADER, TIMESTAMP_KERNEL, TIMESTAMP_INITRD, + * TIMESTAMP_SECURITY_START, and TIMESTAMP_SECURITY_FINISH are set only when + * the manager is system and not running under container environment. + * + * 2. The monotonic timestamp of TIMESTAMP_KERNEL is always zero. + * + * 3. The realtime timestamp of TIMESTAMP_KERNEL will be unset if the system does not + * have RTC. + * + * 4. TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER will be unset if the system does not + * have RTC, or systemd is built without EFI support. + * + * 5. The monotonic timestamps of TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER are stored as + * negative of the actual value. + * + * 6. TIMESTAMP_USERSPACE is the timestamp of when the manager was started. + * + * 7. TIMESTAMP_INITRD_* are set only when the system is booted with an initrd. + */ + +typedef enum ManagerTimestamp { + MANAGER_TIMESTAMP_FIRMWARE, + MANAGER_TIMESTAMP_LOADER, + MANAGER_TIMESTAMP_KERNEL, + MANAGER_TIMESTAMP_INITRD, + MANAGER_TIMESTAMP_USERSPACE, + MANAGER_TIMESTAMP_FINISH, + + MANAGER_TIMESTAMP_SECURITY_START, + MANAGER_TIMESTAMP_SECURITY_FINISH, + MANAGER_TIMESTAMP_GENERATORS_START, + MANAGER_TIMESTAMP_GENERATORS_FINISH, + MANAGER_TIMESTAMP_UNITS_LOAD_START, + MANAGER_TIMESTAMP_UNITS_LOAD_FINISH, + + MANAGER_TIMESTAMP_INITRD_SECURITY_START, + MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH, + MANAGER_TIMESTAMP_INITRD_GENERATORS_START, + MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH, + MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START, + MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH, + _MANAGER_TIMESTAMP_MAX, + _MANAGER_TIMESTAMP_INVALID = -1, +} ManagerTimestamp; + +#include "execute.h" +#include "job.h" +#include "path-lookup.h" +#include "show-status.h" +#include "unit-name.h" + +typedef enum ManagerTestRunFlags { + MANAGER_TEST_NORMAL = 0, /* run normally */ + MANAGER_TEST_RUN_MINIMAL = 1 << 0, /* create basic data structures */ + MANAGER_TEST_RUN_BASIC = 1 << 1, /* interact with the environment */ + MANAGER_TEST_RUN_ENV_GENERATORS = 1 << 2, /* also run env generators */ + MANAGER_TEST_RUN_GENERATORS = 1 << 3, /* also run unit generators */ + MANAGER_TEST_FULL = MANAGER_TEST_RUN_BASIC | MANAGER_TEST_RUN_ENV_GENERATORS | MANAGER_TEST_RUN_GENERATORS, +} ManagerTestRunFlags; + +assert_cc((MANAGER_TEST_FULL & UINT8_MAX) == MANAGER_TEST_FULL); + +struct Manager { + /* Note that the set of units we know of is allowed to be + * inconsistent. However the subset of it that is loaded may + * not, and the list of jobs may neither. */ + + /* Active jobs and units */ + Hashmap *units; /* name string => Unit object n:1 */ + Hashmap *units_by_invocation_id; + Hashmap *jobs; /* job id => Job object 1:1 */ + + /* To make it easy to iterate through the units of a specific + * type we maintain a per type linked list */ + LIST_HEAD(Unit, units_by_type[_UNIT_TYPE_MAX]); + + /* Units that need to be loaded */ + LIST_HEAD(Unit, load_queue); /* this is actually more a stack than a queue, but uh. */ + + /* Jobs that need to be run */ + LIST_HEAD(Job, run_queue); /* more a stack than a queue, too */ + + /* Units and jobs that have not yet been announced via + * D-Bus. When something about a job changes it is added here + * if it is not in there yet. This allows easy coalescing of + * D-Bus change signals. */ + LIST_HEAD(Unit, dbus_unit_queue); + LIST_HEAD(Job, dbus_job_queue); + + /* Units to remove */ + LIST_HEAD(Unit, cleanup_queue); + + /* Units and jobs to check when doing GC */ + LIST_HEAD(Unit, gc_unit_queue); + LIST_HEAD(Job, gc_job_queue); + + /* Units that should be realized */ + LIST_HEAD(Unit, cgroup_realize_queue); + + /* Units whose cgroup ran empty */ + LIST_HEAD(Unit, cgroup_empty_queue); + + /* Target units whose default target dependencies haven't been set yet */ + LIST_HEAD(Unit, target_deps_queue); + + /* Units that might be subject to StopWhenUnneeded= clean-up */ + LIST_HEAD(Unit, stop_when_unneeded_queue); + + sd_event *event; + + /* This maps PIDs we care about to units that are interested in. We allow multiple units to he interested in + * the same PID and multiple PIDs to be relevant to the same unit. Since in most cases only a single unit will + * be interested in the same PID we use a somewhat special encoding here: the first unit interested in a PID is + * stored directly in the hashmap, keyed by the PID unmodified. If there are other units interested too they'll + * be stored in a NULL-terminated array, and keyed by the negative PID. This is safe as pid_t is signed and + * negative PIDs are not used for regular processes but process groups, which we don't care about in this + * context, but this allows us to use the negative range for our own purposes. */ + Hashmap *watch_pids; /* pid => unit as well as -pid => array of units */ + + /* A set contains all units which cgroup should be refreshed after startup */ + Set *startup_units; + + /* A set which contains all currently failed units */ + Set *failed_units; + + sd_event_source *run_queue_event_source; + + char *notify_socket; + int notify_fd; + sd_event_source *notify_event_source; + + int cgroups_agent_fd; + sd_event_source *cgroups_agent_event_source; + + int signal_fd; + sd_event_source *signal_event_source; + + sd_event_source *sigchld_event_source; + + int time_change_fd; + sd_event_source *time_change_event_source; + + sd_event_source *timezone_change_event_source; + + sd_event_source *jobs_in_progress_event_source; + + int user_lookup_fds[2]; + sd_event_source *user_lookup_event_source; + + sd_event_source *sync_bus_names_event_source; + + UnitFileScope unit_file_scope; + LookupPaths lookup_paths; + Set *unit_path_cache; + + char **transient_environment; /* The environment, as determined from config files, kernel cmdline and environment generators */ + char **client_environment; /* Environment variables created by clients through the bus API */ + + usec_t runtime_watchdog; + usec_t shutdown_watchdog; + + dual_timestamp timestamps[_MANAGER_TIMESTAMP_MAX]; + + /* Data specific to the device subsystem */ + sd_device_monitor *device_monitor; + Hashmap *devices_by_sysfs; + + /* Data specific to the mount subsystem */ + struct libmnt_monitor *mount_monitor; + sd_event_source *mount_event_source; + + /* Data specific to the swap filesystem */ + FILE *proc_swaps; + sd_event_source *swap_event_source; + Hashmap *swaps_by_devnode; + + /* Data specific to the D-Bus subsystem */ + sd_bus *api_bus, *system_bus; + Set *private_buses; + int private_listen_fd; + sd_event_source *private_listen_event_source; + + /* Contains all the clients that are subscribed to signals via + the API bus. Note that private bus connections are always + considered subscribes, since they last for very short only, + and it is much simpler that way. */ + sd_bus_track *subscribed; + char **deserialized_subscribed; + + /* This is used during reloading: before the reload we queue + * the reply message here, and afterwards we send it */ + sd_bus_message *pending_reload_message; + + Hashmap *watch_bus; /* D-Bus names => Unit object n:1 */ + + bool send_reloading_done; + + uint32_t current_job_id; + uint32_t default_unit_job_id; + + /* Data specific to the Automount subsystem */ + int dev_autofs_fd; + + /* Data specific to the cgroup subsystem */ + Hashmap *cgroup_unit; + CGroupMask cgroup_supported; + char *cgroup_root; + + /* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */ + int cgroup_inotify_fd; + sd_event_source *cgroup_inotify_event_source; + Hashmap *cgroup_inotify_wd_unit; + + /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */ + sd_event_source *cgroup_empty_event_source; + + /* Make sure the user cannot accidentally unmount our cgroup + * file system */ + int pin_cgroupfs_fd; + + unsigned gc_marker; + + /* The stat() data the last time we saw /etc/localtime */ + usec_t etc_localtime_mtime; + bool etc_localtime_accessible:1; + + ManagerObjective objective:5; + + /* Flags */ + bool dispatching_load_queue:1; + + bool taint_usr:1; + + /* Have we already sent out the READY=1 notification? */ + bool ready_sent:1; + + /* Have we already printed the taint line if necessary? */ + bool taint_logged:1; + + /* Have we ever changed the "kernel.pid_max" sysctl? */ + bool sysctl_pid_max_changed:1; + + ManagerTestRunFlags test_run_flags:8; + + /* If non-zero, exit with the following value when the systemd + * process terminate. Useful for containers: systemd-nspawn could get + * the return value. */ + uint8_t return_value; + + ShowStatus show_status; + char *confirm_spawn; + bool no_console_output; + bool service_watchdogs; + + ExecOutput default_std_output, default_std_error; + + usec_t default_restart_usec, default_timeout_start_usec, default_timeout_stop_usec; + + usec_t default_start_limit_interval; + unsigned default_start_limit_burst; + + bool default_cpu_accounting; + bool default_memory_accounting; + bool default_io_accounting; + bool default_blockio_accounting; + bool default_tasks_accounting; + bool default_ip_accounting; + + uint64_t default_tasks_max; + usec_t default_timer_accuracy_usec; + + int original_log_level; + LogTarget original_log_target; + bool log_level_overridden:1; + bool log_target_overridden:1; + + struct rlimit *rlimit[_RLIMIT_MAX]; + + /* non-zero if we are reloading or reexecuting, */ + int n_reloading; + + unsigned n_installed_jobs; + unsigned n_failed_jobs; + + /* Jobs in progress watching */ + unsigned n_running_jobs; + unsigned n_on_console; + unsigned jobs_in_progress_iteration; + + /* Do we have any outstanding password prompts? */ + int have_ask_password; + int ask_password_inotify_fd; + sd_event_source *ask_password_event_source; + + /* Type=idle pipes */ + int idle_pipe[4]; + sd_event_source *idle_pipe_event_source; + + char *switch_root; + char *switch_root_init; + + /* This maps all possible path prefixes to the units needing + * them. It's a hashmap with a path string as key and a Set as + * value where Unit objects are contained. */ + Hashmap *units_requiring_mounts_for; + + /* Used for processing polkit authorization responses */ + Hashmap *polkit_registry; + + /* Dynamic users/groups, indexed by their name */ + Hashmap *dynamic_users; + + /* Keep track of all UIDs and GIDs any of our services currently use. This is useful for the RemoveIPC= logic. */ + Hashmap *uid_refs; + Hashmap *gid_refs; + + /* ExecRuntime, indexed by their owner unit id */ + Hashmap *exec_runtime_by_id; + + /* When the user hits C-A-D more than 7 times per 2s, do something immediately... */ + RateLimit ctrl_alt_del_ratelimit; + EmergencyAction cad_burst_action; + + const char *unit_log_field; + const char *unit_log_format_string; + + const char *invocation_log_field; + const char *invocation_log_format_string; + + int first_boot; /* tri-state */ + + /* Prefixes of e.g. RuntimeDirectory= */ + char *prefix[_EXEC_DIRECTORY_TYPE_MAX]; + + /* Used in the SIGCHLD and sd_notify() message invocation logic to avoid that we dispatch the same event + * multiple times on the same unit. */ + unsigned sigchldgen; + unsigned notifygen; +}; + +#define MANAGER_IS_SYSTEM(m) ((m)->unit_file_scope == UNIT_FILE_SYSTEM) +#define MANAGER_IS_USER(m) ((m)->unit_file_scope != UNIT_FILE_SYSTEM) + +#define MANAGER_IS_RELOADING(m) ((m)->n_reloading > 0) + +#define MANAGER_IS_FINISHED(m) (dual_timestamp_is_set((m)->timestamps + MANAGER_TIMESTAMP_FINISH)) + +/* The objective is set to OK as soon as we enter the main loop, and set otherwise as soon as we are done with it */ +#define MANAGER_IS_RUNNING(m) ((m)->objective == MANAGER_OK) + +#define MANAGER_IS_TEST_RUN(m) ((m)->test_run_flags != 0) + +int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager **m); +Manager* manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_startup(Manager *m, FILE *serialization, FDSet *fds); + +Job *manager_get_job(Manager *m, uint32_t id); +Unit *manager_get_unit(Manager *m, const char *name); + +int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j); + +int manager_load_unit_prepare(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **_ret); +int manager_load_unit(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **_ret); +int manager_load_startable_unit_or_warn(Manager *m, const char *name, const char *path, Unit **ret); +int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u); + +int manager_add_job(Manager *m, JobType type, Unit *unit, JobMode mode, sd_bus_error *e, Job **_ret); +int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, sd_bus_error *e, Job **_ret); +int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Job **ret); +int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e); + +void manager_dump_units(Manager *s, FILE *f, const char *prefix); +void manager_dump_jobs(Manager *s, FILE *f, const char *prefix); +void manager_dump(Manager *s, FILE *f, const char *prefix); +int manager_get_dump_string(Manager *m, char **ret); + +void manager_clear_jobs(Manager *m); + +unsigned manager_dispatch_load_queue(Manager *m); + +int manager_default_environment(Manager *m); +int manager_transient_environment_add(Manager *m, char **plus); +int manager_client_environment_modify(Manager *m, char **minus, char **plus); +int manager_get_effective_environment(Manager *m, char ***ret); + +int manager_set_default_rlimits(Manager *m, struct rlimit **default_rlimit); + +int manager_loop(Manager *m); + +int manager_open_serialization(Manager *m, FILE **_f); + +int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root); +int manager_deserialize(Manager *m, FILE *f, FDSet *fds); + +int manager_reload(Manager *m); + +void manager_reset_failed(Manager *m); + +void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success); +void manager_send_unit_plymouth(Manager *m, Unit *u); + +bool manager_unit_inactive_or_pending(Manager *m, const char *name); + +void manager_check_finished(Manager *m); + +void manager_recheck_dbus(Manager *m); +void manager_recheck_journal(Manager *m); + +void manager_set_show_status(Manager *m, ShowStatus mode); +void manager_set_first_boot(Manager *m, bool b); + +void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) _printf_(4,5); +void manager_flip_auto_status(Manager *m, bool enable); + +Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path); + +ManagerState manager_state(Manager *m); + +int manager_update_failed_units(Manager *m, Unit *u, bool failed); + +void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now); +int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc); + +void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now); +int manager_ref_gid(Manager *m, gid_t gid, bool destroy_now); + +void manager_vacuum_uid_refs(Manager *m); +void manager_vacuum_gid_refs(Manager *m); + +void manager_serialize_uid_refs(Manager *m, FILE *f); +void manager_deserialize_uid_refs_one(Manager *m, const char *value); + +void manager_serialize_gid_refs(Manager *m, FILE *f); +void manager_deserialize_gid_refs_one(Manager *m, const char *value); + +char *manager_taint_string(Manager *m); + +void manager_ref_console(Manager *m); +void manager_unref_console(Manager *m); + +void manager_override_log_level(Manager *m, int level); +void manager_restore_original_log_level(Manager *m); + +void manager_override_log_target(Manager *m, LogTarget target); +void manager_restore_original_log_target(Manager *m); + +const char *manager_state_to_string(ManagerState m) _const_; +ManagerState manager_state_from_string(const char *s) _pure_; + +const char *manager_get_confirm_spawn(Manager *m); +bool manager_is_confirm_spawn_disabled(Manager *m); +void manager_disable_confirm_spawn(void); + +const char *manager_timestamp_to_string(ManagerTimestamp m) _const_; +ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_; +ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s); diff --git a/src/core/meson.build b/src/core/meson.build new file mode 100644 index 0000000..85021bd --- /dev/null +++ b/src/core/meson.build @@ -0,0 +1,218 @@ +# SPDX-License-Identifier: LGPL-2.1+ + +libcore_la_sources = ''' + audit-fd.c + audit-fd.h + automount.c + automount.h + bpf-devices.c + bpf-devices.h + bpf-firewall.c + bpf-firewall.h + cgroup.c + cgroup.h + chown-recursive.c + chown-recursive.h + dbus-automount.c + dbus-automount.h + dbus-cgroup.c + dbus-cgroup.h + dbus-device.c + dbus-device.h + dbus-execute.c + dbus-execute.h + dbus-job.c + dbus-job.h + dbus-kill.c + dbus-kill.h + dbus-manager.c + dbus-manager.h + dbus-mount.c + dbus-mount.h + dbus-path.c + dbus-path.h + dbus-scope.c + dbus-scope.h + dbus-service.c + dbus-service.h + dbus-slice.c + dbus-slice.h + dbus-socket.c + dbus-socket.h + dbus-swap.c + dbus-swap.h + dbus-target.c + dbus-target.h + dbus-timer.c + dbus-timer.h + dbus-unit.c + dbus-unit.h + dbus-util.c + dbus-util.h + dbus.c + dbus.h + device.c + device.h + dynamic-user.c + dynamic-user.h + emergency-action.c + emergency-action.h + execute.c + execute.h + hostname-setup.c + hostname-setup.h + ima-setup.c + ima-setup.h + ip-address-access.c + ip-address-access.h + job.c + job.h + kill.c + kill.h + killall.c + killall.h + kmod-setup.c + kmod-setup.h + load-dropin.c + load-dropin.h + load-fragment.c + load-fragment.h + locale-setup.c + locale-setup.h + loopback-setup.c + loopback-setup.h + machine-id-setup.c + machine-id-setup.h + manager.c + manager.h + mount-setup.c + mount-setup.h + mount.c + mount.h + namespace.c + namespace.h + path.c + path.h + scope.c + scope.h + selinux-access.c + selinux-access.h + selinux-setup.c + selinux-setup.h + service.c + service.h + show-status.c + show-status.h + slice.c + slice.h + smack-setup.c + smack-setup.h + socket.c + socket.h + swap.c + swap.h + target.c + target.h + timer.c + timer.h + transaction.c + transaction.h + unit-printf.c + unit-printf.h + unit.c + unit.h +'''.split() + +load_fragment_gperf_gperf = custom_target( + 'load-fragment-gperf.gperf', + input : 'load-fragment-gperf.gperf.m4', + output: 'load-fragment-gperf.gperf', + command : [meson_apply_m4, config_h, '@INPUT@'], + capture : true) + +load_fragment_gperf_c = custom_target( + 'load-fragment-gperf.c', + input : load_fragment_gperf_gperf, + output : 'load-fragment-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +awkscript = 'load-fragment-gperf-nulstr.awk' +load_fragment_gperf_nulstr_c = custom_target( + 'load-fragment-gperf-nulstr.c', + input : [awkscript, load_fragment_gperf_gperf], + output : 'load-fragment-gperf-nulstr.c', + command : [awk, '-f', '@INPUT0@', '@INPUT1@'], + capture : true) + +libcore = static_library( + 'core', + libcore_la_sources, + load_fragment_gperf_c, + load_fragment_gperf_nulstr_c, + include_directories : includes, + dependencies : [threads, + librt, + libseccomp, + libpam, + libaudit, + libkmod, + libapparmor, + libselinux, + libmount]) + +systemd_sources = files('main.c') + +systemd_shutdown_sources = files(''' + shutdown.c + umount.c + umount.h + mount-setup.c + mount-setup.h + killall.c + killall.h +'''.split()) + +in_files = [['macros.systemd', rpmmacrosdir], + ['system.conf', pkgsysconfdir], + ['systemd.pc', pkgconfigdatadir], + ['triggers.systemd', '']] + +foreach item : in_files + file = item[0] + dir = item[1] + + configure_file( + input : file + '.in', + output : file, + configuration : substs, + install_dir : dir == 'no' ? '' : dir) +endforeach + +install_data('org.freedesktop.systemd1.conf', + install_dir : dbuspolicydir) +install_data('org.freedesktop.systemd1.service', + install_dir : dbussystemservicedir) + +policy = configure_file( + input : 'org.freedesktop.systemd1.policy.in', + output : 'org.freedesktop.systemd1.policy', + configuration : substs) +install_data(policy, + install_dir : polkitpolicydir) + +install_data('user.conf', + install_dir : pkgsysconfdir) + +meson.add_install_script('sh', '-c', mkdir_p.format(systemshutdowndir)) +meson.add_install_script('sh', '-c', mkdir_p.format(systemsleepdir)) +meson.add_install_script('sh', '-c', mkdir_p.format(systemgeneratordir)) +meson.add_install_script('sh', '-c', mkdir_p.format(usergeneratordir)) + +meson.add_install_script('sh', '-c', + mkdir_p.format(join_paths(pkgsysconfdir, 'system/multi-user.target.wants'))) +meson.add_install_script('sh', '-c', + mkdir_p.format(join_paths(pkgsysconfdir, 'system/getty.target.wants'))) +meson.add_install_script('sh', '-c', + mkdir_p.format(join_paths(pkgsysconfdir, 'user'))) +meson.add_install_script('sh', '-c', + mkdir_p.format(join_paths(sysconfdir, 'xdg/systemd'))) diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c new file mode 100644 index 0000000..3ce6164 --- /dev/null +++ b/src/core/mount-setup.c @@ -0,0 +1,566 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <ftw.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <sys/statvfs.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bus-util.h" +#include "cgroup-util.h" +#include "dev-setup.h" +#include "dirent-util.h" +#include "efivars.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "label.h" +#include "log.h" +#include "macro.h" +#include "missing.h" +#include "mkdir.h" +#include "mount-setup.h" +#include "mountpoint-util.h" +#include "path-util.h" +#include "set.h" +#include "smack-util.h" +#include "strv.h" +#include "user-util.h" +#include "util.h" +#include "virt.h" + +typedef enum MountMode { + MNT_NONE = 0, + MNT_FATAL = 1 << 0, + MNT_IN_CONTAINER = 1 << 1, + MNT_CHECK_WRITABLE = 1 << 2, +} MountMode; + +typedef struct MountPoint { + const char *what; + const char *where; + const char *type; + const char *options; + unsigned long flags; + bool (*condition_fn)(void); + MountMode mode; +} MountPoint; + +/* The first three entries we might need before SELinux is up. The + * fourth (securityfs) is needed by IMA to load a custom policy. The + * other ones we can delay until SELinux and IMA are loaded. When + * SMACK is enabled we need smackfs, too, so it's a fifth one. */ +#if ENABLE_SMACK +#define N_EARLY_MOUNT 5 +#else +#define N_EARLY_MOUNT 4 +#endif + +static const MountPoint mount_table[] = { + { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, + NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_NONE }, +#if ENABLE_SMACK + { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV, + mac_smack_use, MNT_FATAL }, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + mac_smack_use, MNT_FATAL }, +#endif + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, + NULL, MNT_IN_CONTAINER }, +#if ENABLE_SMACK + { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + mac_smack_use, MNT_FATAL }, +#endif + { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, + cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, + { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_legacy_wanted, MNT_IN_CONTAINER }, + { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, + { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_NONE }, +#if ENABLE_EFI + { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + is_efi_boot, MNT_NONE }, +#endif + { "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_NONE, }, +}; + +/* These are API file systems that might be mounted by other software, + * we just list them here so that we know that we should ignore them */ + +static const char ignore_paths[] = + /* SELinux file systems */ + "/sys/fs/selinux\0" + /* Container bind mounts */ + "/proc/sys\0" + "/dev/console\0" + "/proc/kmsg\0"; + +bool mount_point_is_api(const char *path) { + unsigned i; + + /* Checks if this mount point is considered "API", and hence + * should be ignored */ + + for (i = 0; i < ELEMENTSOF(mount_table); i ++) + if (path_equal(path, mount_table[i].where)) + return true; + + return path_startswith(path, "/sys/fs/cgroup/"); +} + +bool mount_point_ignore(const char *path) { + const char *i; + + NULSTR_FOREACH(i, ignore_paths) + if (path_equal(path, i)) + return true; + + return false; +} + +static int mount_one(const MountPoint *p, bool relabel) { + int r, priority; + + assert(p); + + priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG; + + if (p->condition_fn && !p->condition_fn()) + return 0; + + /* Relabel first, just in case */ + if (relabel) + (void) label_fix(p->where, LABEL_IGNORE_ENOENT|LABEL_IGNORE_EROFS); + + r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW); + if (r < 0 && r != -ENOENT) { + log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where); + return (p->mode & MNT_FATAL) ? r : 0; + } + if (r > 0) + return 0; + + /* Skip securityfs in a container */ + if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0) + return 0; + + /* The access mode here doesn't really matter too much, since + * the mounted file system will take precedence anyway. */ + if (relabel) + (void) mkdir_p_label(p->where, 0755); + else + (void) mkdir_p(p->where, 0755); + + log_debug("Mounting %s to %s of type %s with options %s.", + p->what, + p->where, + p->type, + strna(p->options)); + + if (mount(p->what, + p->where, + p->type, + p->flags, + p->options) < 0) { + log_full_errno(priority, errno, "Failed to mount %s at %s: %m", p->type, p->where); + return (p->mode & MNT_FATAL) ? -errno : 0; + } + + /* Relabel again, since we now mounted something fresh here */ + if (relabel) + (void) label_fix(p->where, 0); + + if (p->mode & MNT_CHECK_WRITABLE) { + if (access(p->where, W_OK) < 0) { + r = -errno; + + (void) umount(p->where); + (void) rmdir(p->where); + + log_full_errno(priority, r, "Mount point %s not writable after mounting: %m", p->where); + return (p->mode & MNT_FATAL) ? r : 0; + } + } + + return 1; +} + +static int mount_points_setup(unsigned n, bool loaded_policy) { + unsigned i; + int r = 0; + + for (i = 0; i < n; i ++) { + int j; + + j = mount_one(mount_table + i, loaded_policy); + if (j != 0 && r >= 0) + r = j; + } + + return r; +} + +int mount_setup_early(void) { + assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table)); + + /* Do a minimal mount of /proc and friends to enable the most + * basic stuff, such as SELinux */ + return mount_points_setup(N_EARLY_MOUNT, false); +} + +static const char *join_with(const char *controller) { + + static const char* const pairs[] = { + "cpu", "cpuacct", + "net_cls", "net_prio", + NULL + }; + + const char *const *x, *const *y; + + assert(controller); + + /* This will lookup which controller to mount another controller with. Input is a controller name, and output + * is the other controller name. The function works both ways: you can input one and get the other, and input + * the other to get the one. */ + + STRV_FOREACH_PAIR(x, y, pairs) { + if (streq(controller, *x)) + return *y; + if (streq(controller, *y)) + return *x; + } + + return NULL; +} + +static int symlink_controller(const char *target, const char *alias) { + const char *a; + int r; + + assert(target); + assert(alias); + + a = strjoina("/sys/fs/cgroup/", alias); + + r = symlink_idempotent(target, a, false); + if (r < 0) + return log_error_errno(r, "Failed to create symlink %s: %m", a); + +#ifdef SMACK_RUN_LABEL + const char *p; + + p = strjoina("/sys/fs/cgroup/", target); + + r = mac_smack_copy(a, p); + if (r < 0 && r != -EOPNOTSUPP) + return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", p, a); +#endif + + return 0; +} + +int mount_cgroup_controllers(void) { + _cleanup_set_free_free_ Set *controllers = NULL; + int r; + + if (!cg_is_legacy_wanted()) + return 0; + + /* Mount all available cgroup controllers that are built into the kernel. */ + r = cg_kernel_controllers(&controllers); + if (r < 0) + return log_error_errno(r, "Failed to enumerate cgroup controllers: %m"); + + for (;;) { + _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL; + const char *other_controller; + MountPoint p = { + .what = "cgroup", + .type = "cgroup", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + .mode = MNT_IN_CONTAINER, + }; + + controller = set_steal_first(controllers); + if (!controller) + break; + + /* Check if we shall mount this together with another controller */ + other_controller = join_with(controller); + if (other_controller) { + _cleanup_free_ char *c = NULL; + + /* Check if the other controller is actually available in the kernel too */ + c = set_remove(controllers, other_controller); + if (c) { + + /* Join the two controllers into one string, and maintain a stable ordering */ + if (strcmp(controller, other_controller) < 0) + options = strjoin(controller, ",", other_controller); + else + options = strjoin(other_controller, ",", controller); + if (!options) + return log_oom(); + } + } + + /* The simple case, where there's only one controller to mount together */ + if (!options) + options = TAKE_PTR(controller); + + where = strappend("/sys/fs/cgroup/", options); + if (!where) + return log_oom(); + + p.where = where; + p.options = options; + + r = mount_one(&p, true); + if (r < 0) + return r; + + /* Create symlinks from the individual controller names, in case we have a joined mount */ + if (controller) + (void) symlink_controller(options, controller); + if (other_controller) + (void) symlink_controller(options, other_controller); + } + + /* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */ + (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755"); + + return 0; +} + +#if HAVE_SELINUX || ENABLE_SMACK +static int nftw_cb( + const char *fpath, + const struct stat *sb, + int tflag, + struct FTW *ftwbuf) { + + /* No need to label /dev twice in a row... */ + if (_unlikely_(ftwbuf->level == 0)) + return FTW_CONTINUE; + + (void) label_fix(fpath, 0); + + /* /run/initramfs is static data and big, no need to + * dynamically relabel its contents at boot... */ + if (_unlikely_(ftwbuf->level == 1 && + tflag == FTW_D && + streq(fpath, "/run/initramfs"))) + return FTW_SKIP_SUBTREE; + + return FTW_CONTINUE; +}; + +static int relabel_cgroup_filesystems(void) { + int r; + struct statfs st; + + r = cg_all_unified(); + if (r == 0) { + /* Temporarily remount the root cgroup filesystem to give it a proper label. Do this + only when the filesystem has been already populated by a previous instance of systemd + running from initrd. Otherwise don't remount anything and leave the filesystem read-write + for the cgroup filesystems to be mounted inside. */ + if (statfs("/sys/fs/cgroup", &st) < 0) + return log_error_errno(errno, "Failed to determine mount flags for /sys/fs/cgroup: %m"); + + if (st.f_flags & ST_RDONLY) + (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL); + + (void) label_fix("/sys/fs/cgroup", 0); + (void) nftw("/sys/fs/cgroup", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL); + + if (st.f_flags & ST_RDONLY) + (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL); + + } else if (r < 0) + return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m"); + + return 0; +} + +static int relabel_extra(void) { + _cleanup_closedir_ DIR *d = NULL; + int r, c = 0; + + /* Support for relabelling additional files or directories after loading the policy. For this, code in the + * initrd simply has to drop in *.relabel files into /run/systemd/relabel-extra.d/. We'll read all such files + * expecting one absolute path by line and will relabel each (and everyone below that in case the path refers + * to a directory). These drop-in files are supposed to be absolutely minimal, and do not understand comments + * and such. After the operation succeeded the files are removed, and the drop-in directory as well, if + * possible. + */ + + d = opendir("/run/systemd/relabel-extra.d/"); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to open /run/systemd/relabel-extra.d/, ignoring: %m"); + } + + for (;;) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_close_ int fd = -1; + struct dirent *de; + + errno = 0; + de = readdir_no_dot(d); + if (!de) { + if (errno != 0) + return log_error_errno(errno, "Failed read directory /run/systemd/relabel-extra.d/, ignoring: %m"); + break; + } + + if (hidden_or_backup_file(de->d_name)) + continue; + + if (!endswith(de->d_name, ".relabel")) + continue; + + if (!IN_SET(de->d_type, DT_REG, DT_UNKNOWN)) + continue; + + fd = openat(dirfd(d), de->d_name, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) { + log_warning_errno(errno, "Failed to open /run/systemd/relabel-extra.d/%s, ignoring: %m", de->d_name); + continue; + } + + f = fdopen(fd, "r"); + if (!f) { + log_warning_errno(errno, "Failed to convert file descriptor into file object, ignoring: %m"); + continue; + } + TAKE_FD(fd); + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) { + log_warning_errno(r, "Failed to read from /run/systemd/relabel-extra.d/%s, ignoring: %m", de->d_name); + break; + } + if (r == 0) /* EOF */ + break; + + path_simplify(line, true); + + if (!path_is_normalized(line)) { + log_warning("Path to relabel is not normalized, ignoring: %s", line); + continue; + } + + if (!path_is_absolute(line)) { + log_warning("Path to relabel is not absolute, ignoring: %s", line); + continue; + } + + log_debug("Relabelling additional file/directory '%s'.", line); + (void) nftw(line, nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL); + c++; + } + + if (unlinkat(dirfd(d), de->d_name, 0) < 0) + log_warning_errno(errno, "Failed to remove /run/systemd/relabel-extra.d/%s, ignoring: %m", de->d_name); + } + + /* Remove when we completing things. */ + if (rmdir("/run/systemd/relabel-extra.d") < 0) + log_warning_errno(errno, "Failed to remove /run/systemd/relabel-extra.d/ directory: %m"); + + return c; +} +#endif + +int mount_setup(bool loaded_policy) { + int r = 0; + + r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy); + if (r < 0) + return r; + +#if HAVE_SELINUX || ENABLE_SMACK + /* Nodes in devtmpfs and /run need to be manually updated for + * the appropriate labels, after mounting. The other virtual + * API file systems like /sys and /proc do not need that, they + * use the same label for all their files. */ + if (loaded_policy) { + usec_t before_relabel, after_relabel; + char timespan[FORMAT_TIMESPAN_MAX]; + const char *i; + int n_extra; + + before_relabel = now(CLOCK_MONOTONIC); + + FOREACH_STRING(i, "/dev", "/dev/shm", "/run") + (void) nftw(i, nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL); + + (void) relabel_cgroup_filesystems(); + + n_extra = relabel_extra(); + + after_relabel = now(CLOCK_MONOTONIC); + + log_info("Relabelled /dev, /dev/shm, /run, /sys/fs/cgroup%s in %s.", + n_extra > 0 ? ", additional files" : "", + format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0)); + } +#endif + + /* Create a few default symlinks, which are normally created + * by udevd, but some scripts might need them before we start + * udevd. */ + dev_setup(NULL, UID_INVALID, GID_INVALID); + + /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we + * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of + * the box. If specific setups need other settings they can reset the propagation mode to private if + * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a + * container manager we assume the container manager knows what it is doing (for example, because it set up + * some directories with different propagation modes). */ + if (detect_container() <= 0) + if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0) + log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m"); + + /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so + * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will + * misdetect systemd. */ + (void) mkdir_label("/run/systemd", 0755); + (void) mkdir_label("/run/systemd/system", 0755); + + /* Also create /run/systemd/inaccessible nodes, so that we always have something to mount inaccessible nodes + * from. */ + (void) make_inaccessible_nodes(NULL, UID_INVALID, GID_INVALID); + + return 0; +} diff --git a/src/core/mount-setup.h b/src/core/mount-setup.h new file mode 100644 index 0000000..b4ca2cf --- /dev/null +++ b/src/core/mount-setup.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <stdbool.h> + +int mount_setup_early(void); +int mount_setup(bool loaded_policy); + +int mount_cgroup_controllers(void); + +bool mount_point_is_api(const char *path); +bool mount_point_ignore(const char *path); diff --git a/src/core/mount.c b/src/core/mount.c new file mode 100644 index 0000000..c31cad6 --- /dev/null +++ b/src/core/mount.c @@ -0,0 +1,2043 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <sys/epoll.h> + +#include <libmount.h> + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "dbus-mount.h" +#include "dbus-unit.h" +#include "device.h" +#include "escape.h" +#include "exit-status.h" +#include "format-util.h" +#include "fstab-util.h" +#include "log.h" +#include "manager.h" +#include "mkdir.h" +#include "mount-setup.h" +#include "mount.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +#define RETRY_UMOUNT_MAX 32 + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct libmnt_table*, mnt_free_table); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct libmnt_iter*, mnt_free_iter); + +static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = { + [MOUNT_DEAD] = UNIT_INACTIVE, + [MOUNT_MOUNTING] = UNIT_ACTIVATING, + [MOUNT_MOUNTING_DONE] = UNIT_ACTIVATING, + [MOUNT_MOUNTED] = UNIT_ACTIVE, + [MOUNT_REMOUNTING] = UNIT_RELOADING, + [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING, + [MOUNT_REMOUNTING_SIGTERM] = UNIT_RELOADING, + [MOUNT_REMOUNTING_SIGKILL] = UNIT_RELOADING, + [MOUNT_UNMOUNTING_SIGTERM] = UNIT_DEACTIVATING, + [MOUNT_UNMOUNTING_SIGKILL] = UNIT_DEACTIVATING, + [MOUNT_FAILED] = UNIT_FAILED +}; + +static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); + +static bool MOUNT_STATE_WITH_PROCESS(MountState state) { + return IN_SET(state, + MOUNT_MOUNTING, + MOUNT_MOUNTING_DONE, + MOUNT_REMOUNTING, + MOUNT_REMOUNTING_SIGTERM, + MOUNT_REMOUNTING_SIGKILL, + MOUNT_UNMOUNTING, + MOUNT_UNMOUNTING_SIGTERM, + MOUNT_UNMOUNTING_SIGKILL); +} + +static bool mount_is_network(const MountParameters *p) { + assert(p); + + if (fstab_test_option(p->options, "_netdev\0")) + return true; + + if (p->fstype && fstype_is_network(p->fstype)) + return true; + + return false; +} + +static bool mount_is_loop(const MountParameters *p) { + assert(p); + + if (fstab_test_option(p->options, "loop\0")) + return true; + + return false; +} + +static bool mount_is_bind(const MountParameters *p) { + assert(p); + + if (fstab_test_option(p->options, "bind\0" "rbind\0")) + return true; + + if (p->fstype && STR_IN_SET(p->fstype, "bind", "rbind")) + return true; + + return false; +} + +static bool mount_is_auto(const MountParameters *p) { + assert(p); + + return !fstab_test_option(p->options, "noauto\0"); +} + +static bool mount_is_automount(const MountParameters *p) { + assert(p); + + return fstab_test_option(p->options, + "comment=systemd.automount\0" + "x-systemd.automount\0"); +} + +static bool mount_is_bound_to_device(const Mount *m) { + const MountParameters *p; + + if (m->from_fragment) + return true; + + p = &m->parameters_proc_self_mountinfo; + return fstab_test_option(p->options, "x-systemd.device-bound\0"); +} + +static bool mount_needs_quota(const MountParameters *p) { + assert(p); + + /* Quotas are not enabled on network filesystems, but we want them, for example, on storage connected via + * iscsi. We hence don't use mount_is_network() here, as that would also return true for _netdev devices. */ + if (p->fstype && fstype_is_network(p->fstype)) + return false; + + if (mount_is_bind(p)) + return false; + + return fstab_test_option(p->options, + "usrquota\0" "grpquota\0" "quota\0" "usrjquota\0" "grpjquota\0"); +} + +static void mount_init(Unit *u) { + Mount *m = MOUNT(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + m->timeout_usec = u->manager->default_timeout_start_usec; + + m->exec_context.std_output = u->manager->default_std_output; + m->exec_context.std_error = u->manager->default_std_error; + + m->directory_mode = 0755; + + /* We need to make sure that /usr/bin/mount is always called + * in the same process group as us, so that the autofs kernel + * side doesn't send us another mount request while we are + * already trying to comply its last one. */ + m->exec_context.same_pgrp = true; + + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + + u->ignore_on_isolate = true; +} + +static int mount_arm_timer(Mount *m, usec_t usec) { + int r; + + assert(m); + + if (m->timer_event_source) { + r = sd_event_source_set_time(m->timer_event_source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(m->timer_event_source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time( + UNIT(m)->manager->event, + &m->timer_event_source, + CLOCK_MONOTONIC, + usec, 0, + mount_dispatch_timer, m); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->timer_event_source, "mount-timer"); + + return 0; +} + +static void mount_unwatch_control_pid(Mount *m) { + assert(m); + + if (m->control_pid <= 0) + return; + + unit_unwatch_pid(UNIT(m), m->control_pid); + m->control_pid = 0; +} + +static void mount_parameters_done(MountParameters *p) { + assert(p); + + p->what = mfree(p->what); + p->options = mfree(p->options); + p->fstype = mfree(p->fstype); +} + +static void mount_done(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + m->where = mfree(m->where); + + mount_parameters_done(&m->parameters_proc_self_mountinfo); + mount_parameters_done(&m->parameters_fragment); + + m->exec_runtime = exec_runtime_unref(m->exec_runtime, false); + exec_command_done_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX); + m->control_command = NULL; + + dynamic_creds_unref(&m->dynamic_creds); + + mount_unwatch_control_pid(m); + + m->timer_event_source = sd_event_source_unref(m->timer_event_source); +} + +_pure_ static MountParameters* get_mount_parameters_fragment(Mount *m) { + assert(m); + + if (m->from_fragment) + return &m->parameters_fragment; + + return NULL; +} + +_pure_ static MountParameters* get_mount_parameters(Mount *m) { + assert(m); + + if (m->from_proc_self_mountinfo) + return &m->parameters_proc_self_mountinfo; + + return get_mount_parameters_fragment(m); +} + +static int update_parameters_proc_self_mount_info( + Mount *m, + const char *what, + const char *options, + const char *fstype) { + + MountParameters *p; + int r, q, w; + + p = &m->parameters_proc_self_mountinfo; + + r = free_and_strdup(&p->what, what); + if (r < 0) + return r; + + q = free_and_strdup(&p->options, options); + if (q < 0) + return q; + + w = free_and_strdup(&p->fstype, fstype); + if (w < 0) + return w; + + return r > 0 || q > 0 || w > 0; +} + +static int mount_add_mount_dependencies(Mount *m) { + MountParameters *pm; + Unit *other; + Iterator i; + Set *s; + int r; + + assert(m); + + if (!path_equal(m->where, "/")) { + _cleanup_free_ char *parent = NULL; + + /* Adds in links to other mount points that might lie further up in the hierarchy */ + + parent = dirname_malloc(m->where); + if (!parent) + return -ENOMEM; + + r = unit_require_mounts_for(UNIT(m), parent, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + } + + /* Adds in dependencies to other mount points that might be needed for the source path (if this is a bind mount + * or a loop mount) to be available. */ + pm = get_mount_parameters_fragment(m); + if (pm && pm->what && + path_is_absolute(pm->what) && + (mount_is_bind(pm) || mount_is_loop(pm) || !mount_is_network(pm))) { + + r = unit_require_mounts_for(UNIT(m), pm->what, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + /* Adds in dependencies to other units that use this path or paths further down in the hierarchy */ + s = manager_get_units_requiring_mounts_for(UNIT(m)->manager, m->where); + SET_FOREACH(other, s, i) { + + if (other->load_state != UNIT_LOADED) + continue; + + if (other == UNIT(m)) + continue; + + r = unit_add_dependency(other, UNIT_AFTER, UNIT(m), true, UNIT_DEPENDENCY_PATH); + if (r < 0) + return r; + + if (UNIT(m)->fragment_path) { + /* If we have fragment configuration, then make this dependency required */ + r = unit_add_dependency(other, UNIT_REQUIRES, UNIT(m), true, UNIT_DEPENDENCY_PATH); + if (r < 0) + return r; + } + } + + return 0; +} + +static int mount_add_device_dependencies(Mount *m) { + bool device_wants_mount; + UnitDependencyMask mask; + MountParameters *p; + UnitDependency dep; + int r; + + assert(m); + + p = get_mount_parameters(m); + if (!p) + return 0; + + if (!p->what) + return 0; + + if (mount_is_bind(p)) + return 0; + + if (!is_device_path(p->what)) + return 0; + + /* /dev/root is a really weird thing, it's not a real device, + * but just a path the kernel exports for the root file system + * specified on the kernel command line. Ignore it here. */ + if (path_equal(p->what, "/dev/root")) + return 0; + + if (path_equal(m->where, "/")) + return 0; + + device_wants_mount = + mount_is_auto(p) && !mount_is_automount(p) && MANAGER_IS_SYSTEM(UNIT(m)->manager); + + /* Mount units from /proc/self/mountinfo are not bound to devices + * by default since they're subject to races when devices are + * unplugged. But the user can still force this dep with an + * appropriate option (or udev property) so the mount units are + * automatically stopped when the device disappears suddenly. */ + dep = mount_is_bound_to_device(m) ? UNIT_BINDS_TO : UNIT_REQUIRES; + + /* We always use 'what' from /proc/self/mountinfo if mounted */ + mask = m->from_proc_self_mountinfo ? UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT : UNIT_DEPENDENCY_FILE; + + r = unit_add_node_dependency(UNIT(m), p->what, device_wants_mount, dep, mask); + if (r < 0) + return r; + + return 0; +} + +static int mount_add_quota_dependencies(Mount *m) { + UnitDependencyMask mask; + MountParameters *p; + int r; + + assert(m); + + if (!MANAGER_IS_SYSTEM(UNIT(m)->manager)) + return 0; + + p = get_mount_parameters_fragment(m); + if (!p) + return 0; + + if (!mount_needs_quota(p)) + return 0; + + mask = m->from_fragment ? UNIT_DEPENDENCY_FILE : UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT; + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTACHECK_SERVICE, true, mask); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTAON_SERVICE, true, mask); + if (r < 0) + return r; + + return 0; +} + +static bool mount_is_extrinsic(Mount *m) { + MountParameters *p; + assert(m); + + /* Returns true for all units that are "magic" and should be excluded from the usual start-up and shutdown + * dependencies. We call them "extrinsic" here, as they are generally mounted outside of the systemd dependency + * logic. We shouldn't attempt to manage them ourselves but it's fine if the user operates on them with us. */ + + if (!MANAGER_IS_SYSTEM(UNIT(m)->manager)) /* We only automatically manage mounts if we are in system mode */ + return true; + + if (PATH_IN_SET(m->where, /* Don't bother with the OS data itself */ + "/", + "/usr")) + return true; + + if (PATH_STARTSWITH_SET(m->where, + "/run/initramfs", /* This should stay around from before we boot until after we shutdown */ + "/proc", /* All of this is API VFS */ + "/sys", /* … dito … */ + "/dev")) /* … dito … */ + return true; + + /* If this is an initrd mount, and we are not in the initrd, then leave this around forever, too. */ + p = get_mount_parameters(m); + if (p && fstab_test_option(p->options, "x-initrd.mount\0") && !in_initrd()) + return true; + + return false; +} + +static int mount_add_default_dependencies(Mount *m) { + const char *after, *before; + UnitDependencyMask mask; + MountParameters *p; + bool nofail; + int r; + + assert(m); + + if (!UNIT(m)->default_dependencies) + return 0; + + /* We do not add any default dependencies to /, /usr or /run/initramfs/, since they are guaranteed to stay + * mounted the whole time, since our system is on it. Also, don't bother with anything mounted below virtual + * file systems, it's also going to be virtual, and hence not worth the effort. */ + if (mount_is_extrinsic(m)) + return 0; + + p = get_mount_parameters(m); + if (!p) + return 0; + + mask = m->from_fragment ? UNIT_DEPENDENCY_FILE : UNIT_DEPENDENCY_MOUNTINFO_DEFAULT; + nofail = m->from_fragment ? fstab_test_yes_no_option(m->parameters_fragment.options, "nofail\0" "fail\0") : false; + + if (mount_is_network(p)) { + /* We order ourselves after network.target. This is + * primarily useful at shutdown: services that take + * down the network should order themselves before + * network.target, so that they are shut down only + * after this mount unit is stopped. */ + + r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_NETWORK_TARGET, true, mask); + if (r < 0) + return r; + + /* We pull in network-online.target, and order + * ourselves after it. This is useful at start-up to + * actively pull in tools that want to be started + * before we start mounting network file systems, and + * whose purpose it is to delay this until the network + * is "up". */ + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_WANTS, UNIT_AFTER, SPECIAL_NETWORK_ONLINE_TARGET, true, mask); + if (r < 0) + return r; + + after = SPECIAL_REMOTE_FS_PRE_TARGET; + before = SPECIAL_REMOTE_FS_TARGET; + } else { + after = SPECIAL_LOCAL_FS_PRE_TARGET; + before = SPECIAL_LOCAL_FS_TARGET; + } + + if (!nofail) { + r = unit_add_dependency_by_name(UNIT(m), UNIT_BEFORE, before, true, mask); + if (r < 0) + return r; + } + + r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, after, true, mask); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, mask); + if (r < 0) + return r; + + /* If this is a tmpfs mount then we have to unmount it before we try to deactivate swaps */ + if (streq_ptr(p->fstype, "tmpfs")) { + r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_SWAP_TARGET, true, mask); + if (r < 0) + return r; + } + + return 0; +} + +static int mount_verify(Mount *m) { + _cleanup_free_ char *e = NULL; + MountParameters *p; + int r; + + assert(m); + + if (UNIT(m)->load_state != UNIT_LOADED) + return 0; + + if (!m->from_fragment && !m->from_proc_self_mountinfo && !UNIT(m)->perpetual) + return -ENOENT; + + r = unit_name_from_path(m->where, ".mount", &e); + if (r < 0) + return log_unit_error_errno(UNIT(m), r, "Failed to generate unit name from mount path: %m"); + + if (!unit_has_name(UNIT(m), e)) { + log_unit_error(UNIT(m), "Where= setting doesn't match unit name. Refusing."); + return -ENOEXEC; + } + + if (mount_point_is_api(m->where) || mount_point_ignore(m->where)) { + log_unit_error(UNIT(m), "Cannot create mount unit for API file system %s. Refusing.", m->where); + return -ENOEXEC; + } + + p = get_mount_parameters_fragment(m); + if (p && !p->what) { + log_unit_error(UNIT(m), "What= setting is missing. Refusing."); + return -ENOEXEC; + } + + if (m->exec_context.pam_name && m->kill_context.kill_mode != KILL_CONTROL_GROUP) { + log_unit_error(UNIT(m), "Unit has PAM enabled. Kill mode must be set to control-group'. Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static int mount_add_extras(Mount *m) { + Unit *u = UNIT(m); + int r; + + assert(m); + + /* Note: this call might be called after we already have been loaded once (and even when it has already been + * activated), in case data from /proc/self/mountinfo has changed. This means all code here needs to be ready + * to run with an already set up unit. */ + + if (u->fragment_path) + m->from_fragment = true; + + if (!m->where) { + r = unit_name_to_path(u->id, &m->where); + if (r < 0) + return r; + } + + path_simplify(m->where, false); + + if (!u->description) { + r = unit_set_description(u, m->where); + if (r < 0) + return r; + } + + r = mount_add_device_dependencies(m); + if (r < 0) + return r; + + r = mount_add_mount_dependencies(m); + if (r < 0) + return r; + + r = mount_add_quota_dependencies(m); + if (r < 0) + return r; + + r = unit_patch_contexts(u); + if (r < 0) + return r; + + r = unit_add_exec_dependencies(u, &m->exec_context); + if (r < 0) + return r; + + r = unit_set_default_slice(u); + if (r < 0) + return r; + + r = mount_add_default_dependencies(m); + if (r < 0) + return r; + + return 0; +} + +static int mount_load_root_mount(Unit *u) { + assert(u); + + if (!unit_has_name(u, SPECIAL_ROOT_MOUNT)) + return 0; + + u->perpetual = true; + u->default_dependencies = false; + + /* The stdio/kmsg bridge socket is on /, in order to avoid a dep loop, don't use kmsg logging for -.mount */ + MOUNT(u)->exec_context.std_output = EXEC_OUTPUT_NULL; + MOUNT(u)->exec_context.std_input = EXEC_INPUT_NULL; + + if (!u->description) + u->description = strdup("Root Mount"); + + return 1; +} + +static int mount_load(Unit *u) { + Mount *m = MOUNT(u); + int r, q, w; + + assert(u); + assert(u->load_state == UNIT_STUB); + + r = mount_load_root_mount(u); + + if (m->from_proc_self_mountinfo || u->perpetual) + q = unit_load_fragment_and_dropin_optional(u); + else + q = unit_load_fragment_and_dropin(u); + + /* Add in some extras. Note we do this in all cases (even if we failed to load the unit) when announced by the + * kernel, because we need some things to be set up no matter what when the kernel establishes a mount and thus + * we need to update the state in our unit to track it. After all, consider that we don't allow changing the + * 'slice' field for a unit once it is active. */ + if (u->load_state == UNIT_LOADED || m->from_proc_self_mountinfo || u->perpetual) + w = mount_add_extras(m); + else + w = 0; + + if (r < 0) + return r; + if (q < 0) + return q; + if (w < 0) + return w; + + return mount_verify(m); +} + +static void mount_set_state(Mount *m, MountState state) { + MountState old_state; + assert(m); + + if (m->state != state) + bus_unit_send_pending_change_signal(UNIT(m), false); + + old_state = m->state; + m->state = state; + + if (!MOUNT_STATE_WITH_PROCESS(state)) { + m->timer_event_source = sd_event_source_unref(m->timer_event_source); + mount_unwatch_control_pid(m); + m->control_command = NULL; + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + } + + if (state != old_state) + log_unit_debug(UNIT(m), "Changed %s -> %s", mount_state_to_string(old_state), mount_state_to_string(state)); + + unit_notify(UNIT(m), state_translation_table[old_state], state_translation_table[state], + m->reload_result == MOUNT_SUCCESS ? 0 : UNIT_NOTIFY_RELOAD_FAILURE); +} + +static int mount_coldplug(Unit *u) { + Mount *m = MOUNT(u); + MountState new_state = MOUNT_DEAD; + int r; + + assert(m); + assert(m->state == MOUNT_DEAD); + + if (m->deserialized_state != m->state) + new_state = m->deserialized_state; + else if (m->from_proc_self_mountinfo) + new_state = MOUNT_MOUNTED; + + if (new_state == m->state) + return 0; + + if (m->control_pid > 0 && + pid_is_unwaited(m->control_pid) && + MOUNT_STATE_WITH_PROCESS(new_state)) { + + r = unit_watch_pid(UNIT(m), m->control_pid); + if (r < 0) + return r; + + r = mount_arm_timer(m, usec_add(u->state_change_timestamp.monotonic, m->timeout_usec)); + if (r < 0) + return r; + } + + if (!IN_SET(new_state, MOUNT_DEAD, MOUNT_FAILED)) { + (void) unit_setup_dynamic_creds(u); + (void) unit_setup_exec_runtime(u); + } + + mount_set_state(m, new_state); + return 0; +} + +static void mount_dump(Unit *u, FILE *f, const char *prefix) { + char buf[FORMAT_TIMESPAN_MAX]; + Mount *m = MOUNT(u); + MountParameters *p; + + assert(m); + assert(f); + + p = get_mount_parameters(m); + + fprintf(f, + "%sMount State: %s\n" + "%sResult: %s\n" + "%sWhere: %s\n" + "%sWhat: %s\n" + "%sFile System Type: %s\n" + "%sOptions: %s\n" + "%sFrom /proc/self/mountinfo: %s\n" + "%sFrom fragment: %s\n" + "%sExtrinsic: %s\n" + "%sDirectoryMode: %04o\n" + "%sSloppyOptions: %s\n" + "%sLazyUnmount: %s\n" + "%sForceUnmount: %s\n" + "%sTimeoutSec: %s\n", + prefix, mount_state_to_string(m->state), + prefix, mount_result_to_string(m->result), + prefix, m->where, + prefix, p ? strna(p->what) : "n/a", + prefix, p ? strna(p->fstype) : "n/a", + prefix, p ? strna(p->options) : "n/a", + prefix, yes_no(m->from_proc_self_mountinfo), + prefix, yes_no(m->from_fragment), + prefix, yes_no(mount_is_extrinsic(m)), + prefix, m->directory_mode, + prefix, yes_no(m->sloppy_options), + prefix, yes_no(m->lazy_unmount), + prefix, yes_no(m->force_unmount), + prefix, format_timespan(buf, sizeof(buf), m->timeout_usec, USEC_PER_SEC)); + + if (m->control_pid > 0) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, m->control_pid); + + exec_context_dump(&m->exec_context, f, prefix); + kill_context_dump(&m->kill_context, f, prefix); + cgroup_context_dump(&m->cgroup_context, f, prefix); +} + +static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { + + _cleanup_(exec_params_clear) ExecParameters exec_params = { + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, + }; + pid_t pid; + int r; + + assert(m); + assert(c); + assert(_pid); + + r = unit_prepare_exec(UNIT(m)); + if (r < 0) + return r; + + r = mount_arm_timer(m, usec_add(now(CLOCK_MONOTONIC), m->timeout_usec)); + if (r < 0) + return r; + + r = unit_set_exec_params(UNIT(m), &exec_params); + if (r < 0) + return r; + + r = exec_spawn(UNIT(m), + c, + &m->exec_context, + &exec_params, + m->exec_runtime, + &m->dynamic_creds, + &pid); + if (r < 0) + return r; + + r = unit_watch_pid(UNIT(m), pid); + if (r < 0) + /* FIXME: we need to do something here */ + return r; + + *_pid = pid; + + return 0; +} + +static void mount_enter_dead(Mount *m, MountResult f) { + assert(m); + + if (m->result == MOUNT_SUCCESS) + m->result = f; + + unit_log_result(UNIT(m), m->result == MOUNT_SUCCESS, mount_result_to_string(m->result)); + mount_set_state(m, m->result != MOUNT_SUCCESS ? MOUNT_FAILED : MOUNT_DEAD); + + m->exec_runtime = exec_runtime_unref(m->exec_runtime, true); + + exec_context_destroy_runtime_directory(&m->exec_context, UNIT(m)->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + + unit_unref_uid_gid(UNIT(m), true); + + dynamic_creds_destroy(&m->dynamic_creds); + + /* Any dependencies based on /proc/self/mountinfo are now stale */ + unit_remove_dependencies(UNIT(m), UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT); +} + +static void mount_enter_mounted(Mount *m, MountResult f) { + assert(m); + + if (m->result == MOUNT_SUCCESS) + m->result = f; + + mount_set_state(m, MOUNT_MOUNTED); +} + +static void mount_enter_dead_or_mounted(Mount *m, MountResult f) { + assert(m); + + /* Enter DEAD or MOUNTED state, depending on what the kernel currently says about the mount point. We use this + * whenever we executed an operation, so that our internal state reflects what the kernel says again, after all + * ultimately we just mirror the kernel's internal state on this. */ + + if (m->from_proc_self_mountinfo) + mount_enter_mounted(m, f); + else + mount_enter_dead(m, f); +} + +static int state_to_kill_operation(MountState state) { + switch (state) { + + case MOUNT_REMOUNTING_SIGTERM: + case MOUNT_UNMOUNTING_SIGTERM: + return KILL_TERMINATE; + + case MOUNT_REMOUNTING_SIGKILL: + case MOUNT_UNMOUNTING_SIGKILL: + return KILL_KILL; + + default: + return _KILL_OPERATION_INVALID; + } +} + +static void mount_enter_signal(Mount *m, MountState state, MountResult f) { + int r; + + assert(m); + + if (m->result == MOUNT_SUCCESS) + m->result = f; + + r = unit_kill_context( + UNIT(m), + &m->kill_context, + state_to_kill_operation(state), + -1, + m->control_pid, + false); + if (r < 0) + goto fail; + + if (r > 0) { + r = mount_arm_timer(m, usec_add(now(CLOCK_MONOTONIC), m->timeout_usec)); + if (r < 0) + goto fail; + + mount_set_state(m, state); + } else if (state == MOUNT_REMOUNTING_SIGTERM && m->kill_context.send_sigkill) + mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS); + else if (IN_SET(state, MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL)) + mount_enter_mounted(m, MOUNT_SUCCESS); + else if (state == MOUNT_UNMOUNTING_SIGTERM && m->kill_context.send_sigkill) + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS); + else + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(m), r, "Failed to kill processes: %m"); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); +} + +static void mount_enter_unmounting(Mount *m) { + int r; + + assert(m); + + /* Start counting our attempts */ + if (!IN_SET(m->state, + MOUNT_UNMOUNTING, + MOUNT_UNMOUNTING_SIGTERM, + MOUNT_UNMOUNTING_SIGKILL)) + m->n_retry_umount = 0; + + m->control_command_id = MOUNT_EXEC_UNMOUNT; + m->control_command = m->exec_command + MOUNT_EXEC_UNMOUNT; + + r = exec_command_set(m->control_command, UMOUNT_PATH, m->where, "-c", NULL); + if (r >= 0 && m->lazy_unmount) + r = exec_command_append(m->control_command, "-l", NULL); + if (r >= 0 && m->force_unmount) + r = exec_command_append(m->control_command, "-f", NULL); + if (r < 0) + goto fail; + + mount_unwatch_control_pid(m); + + r = mount_spawn(m, m->control_command, &m->control_pid); + if (r < 0) + goto fail; + + mount_set_state(m, MOUNT_UNMOUNTING); + + return; + +fail: + log_unit_warning_errno(UNIT(m), r, "Failed to run 'umount' task: %m"); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); +} + +static void mount_enter_mounting(Mount *m) { + int r; + MountParameters *p; + + assert(m); + + r = unit_fail_if_noncanonical(UNIT(m), m->where); + if (r < 0) + goto fail; + + (void) mkdir_p_label(m->where, m->directory_mode); + + unit_warn_if_dir_nonempty(UNIT(m), m->where); + unit_warn_leftover_processes(UNIT(m)); + + m->control_command_id = MOUNT_EXEC_MOUNT; + m->control_command = m->exec_command + MOUNT_EXEC_MOUNT; + + /* Create the source directory for bind-mounts if needed */ + p = get_mount_parameters_fragment(m); + if (p && mount_is_bind(p)) + (void) mkdir_p_label(p->what, m->directory_mode); + + if (p) { + _cleanup_free_ char *opts = NULL; + + r = fstab_filter_options(p->options, "nofail\0" "noauto\0" "auto\0", NULL, NULL, &opts); + if (r < 0) + goto fail; + + r = exec_command_set(m->control_command, MOUNT_PATH, p->what, m->where, NULL); + if (r >= 0 && m->sloppy_options) + r = exec_command_append(m->control_command, "-s", NULL); + if (r >= 0 && p->fstype) + r = exec_command_append(m->control_command, "-t", p->fstype, NULL); + if (r >= 0 && !isempty(opts)) + r = exec_command_append(m->control_command, "-o", opts, NULL); + } else + r = -ENOENT; + if (r < 0) + goto fail; + + mount_unwatch_control_pid(m); + + r = mount_spawn(m, m->control_command, &m->control_pid); + if (r < 0) + goto fail; + + mount_set_state(m, MOUNT_MOUNTING); + + return; + +fail: + log_unit_warning_errno(UNIT(m), r, "Failed to run 'mount' task: %m"); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); +} + +static void mount_set_reload_result(Mount *m, MountResult result) { + assert(m); + + /* Only store the first error we encounter */ + if (m->reload_result != MOUNT_SUCCESS) + return; + + m->reload_result = result; +} + +static void mount_enter_remounting(Mount *m) { + int r; + MountParameters *p; + + assert(m); + + /* Reset reload result when we are about to start a new remount operation */ + m->reload_result = MOUNT_SUCCESS; + + m->control_command_id = MOUNT_EXEC_REMOUNT; + m->control_command = m->exec_command + MOUNT_EXEC_REMOUNT; + + p = get_mount_parameters_fragment(m); + if (p) { + const char *o; + + if (p->options) + o = strjoina("remount,", p->options); + else + o = "remount"; + + r = exec_command_set(m->control_command, MOUNT_PATH, + p->what, m->where, + "-o", o, NULL); + if (r >= 0 && m->sloppy_options) + r = exec_command_append(m->control_command, "-s", NULL); + if (r >= 0 && p->fstype) + r = exec_command_append(m->control_command, "-t", p->fstype, NULL); + } else + r = -ENOENT; + if (r < 0) + goto fail; + + mount_unwatch_control_pid(m); + + r = mount_spawn(m, m->control_command, &m->control_pid); + if (r < 0) + goto fail; + + mount_set_state(m, MOUNT_REMOUNTING); + + return; + +fail: + log_unit_warning_errno(UNIT(m), r, "Failed to run 'remount' task: %m"); + mount_set_reload_result(m, MOUNT_FAILURE_RESOURCES); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); +} + +static void mount_cycle_clear(Mount *m) { + assert(m); + + /* Clear all state we shall forget for this new cycle */ + + m->result = MOUNT_SUCCESS; + m->reload_result = MOUNT_SUCCESS; + exec_command_reset_status_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX); + UNIT(m)->reset_accounting = true; +} + +static int mount_start(Unit *u) { + Mount *m = MOUNT(u); + int r; + + assert(m); + + /* We cannot fulfill this request right now, try again later + * please! */ + if (IN_SET(m->state, + MOUNT_UNMOUNTING, + MOUNT_UNMOUNTING_SIGTERM, + MOUNT_UNMOUNTING_SIGKILL)) + return -EAGAIN; + + /* Already on it! */ + if (m->state == MOUNT_MOUNTING) + return 0; + + assert(IN_SET(m->state, MOUNT_DEAD, MOUNT_FAILED)); + + r = unit_start_limit_test(u); + if (r < 0) { + mount_enter_dead(m, MOUNT_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + mount_cycle_clear(m); + mount_enter_mounting(m); + + return 1; +} + +static int mount_stop(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + switch (m->state) { + + case MOUNT_UNMOUNTING: + case MOUNT_UNMOUNTING_SIGKILL: + case MOUNT_UNMOUNTING_SIGTERM: + /* Already on it */ + return 0; + + case MOUNT_MOUNTING: + case MOUNT_MOUNTING_DONE: + case MOUNT_REMOUNTING: + /* If we are still waiting for /bin/mount, we go directly into kill mode. */ + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_SUCCESS); + return 0; + + case MOUNT_REMOUNTING_SIGTERM: + /* If we are already waiting for a hung remount, convert this to the matching unmounting state */ + mount_set_state(m, MOUNT_UNMOUNTING_SIGTERM); + return 0; + + case MOUNT_REMOUNTING_SIGKILL: + /* as above */ + mount_set_state(m, MOUNT_UNMOUNTING_SIGKILL); + return 0; + + case MOUNT_MOUNTED: + mount_enter_unmounting(m); + return 1; + + default: + assert_not_reached("Unexpected state."); + } +} + +static int mount_reload(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + assert(m->state == MOUNT_MOUNTED); + + mount_enter_remounting(m); + + return 1; +} + +static int mount_serialize(Unit *u, FILE *f, FDSet *fds) { + Mount *m = MOUNT(u); + + assert(m); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", mount_state_to_string(m->state)); + (void) serialize_item(f, "result", mount_result_to_string(m->result)); + (void) serialize_item(f, "reload-result", mount_result_to_string(m->reload_result)); + (void) serialize_item_format(f, "n-retry-umount", "%u", m->n_retry_umount); + + if (m->control_pid > 0) + (void) serialize_item_format(f, "control-pid", PID_FMT, m->control_pid); + + if (m->control_command_id >= 0) + (void) serialize_item(f, "control-command", mount_exec_command_to_string(m->control_command_id)); + + return 0; +} + +static int mount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Mount *m = MOUNT(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + MountState state; + + if ((state = mount_state_from_string(value)) < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + m->deserialized_state = state; + + } else if (streq(key, "result")) { + MountResult f; + + f = mount_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != MOUNT_SUCCESS) + m->result = f; + + } else if (streq(key, "reload-result")) { + MountResult f; + + f = mount_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse reload result value: %s", value); + else if (f != MOUNT_SUCCESS) + m->reload_result = f; + + } else if (streq(key, "n-retry-umount")) { + + r = safe_atou(value, &m->n_retry_umount); + if (r < 0) + log_unit_debug(u, "Failed to parse n-retry-umount value: %s", value); + + } else if (streq(key, "control-pid")) { + + if (parse_pid(value, &m->control_pid) < 0) + log_unit_debug(u, "Failed to parse control-pid value: %s", value); + + } else if (streq(key, "control-command")) { + MountExecCommand id; + + id = mount_exec_command_from_string(value); + if (id < 0) + log_unit_debug(u, "Failed to parse exec-command value: %s", value); + else { + m->control_command_id = id; + m->control_command = m->exec_command + id; + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +_pure_ static UnitActiveState mount_active_state(Unit *u) { + assert(u); + + return state_translation_table[MOUNT(u)->state]; +} + +_pure_ static const char *mount_sub_state_to_string(Unit *u) { + assert(u); + + return mount_state_to_string(MOUNT(u)->state); +} + +_pure_ static bool mount_may_gc(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + if (m->from_proc_self_mountinfo) + return false; + + return true; +} + +static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Mount *m = MOUNT(u); + MountResult f; + + assert(m); + assert(pid >= 0); + + if (pid != m->control_pid) + return; + + m->control_pid = 0; + + if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + f = MOUNT_SUCCESS; + else if (code == CLD_EXITED) + f = MOUNT_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = MOUNT_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = MOUNT_FAILURE_CORE_DUMP; + else + assert_not_reached("Unknown code"); + + if (IN_SET(m->state, MOUNT_REMOUNTING, MOUNT_REMOUNTING_SIGKILL, MOUNT_REMOUNTING_SIGTERM)) + mount_set_reload_result(m, f); + else if (m->result == MOUNT_SUCCESS) + m->result = f; + + if (m->control_command) { + exec_status_exit(&m->control_command->exec_status, &m->exec_context, pid, code, status); + + m->control_command = NULL; + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + } + + unit_log_process_exit( + u, f == MOUNT_SUCCESS ? LOG_DEBUG : LOG_NOTICE, + "Mount process", + mount_exec_command_to_string(m->control_command_id), + code, status); + + /* Note that due to the io event priority logic, we can be sure the new mountinfo is loaded + * before we process the SIGCHLD for the mount command. */ + + switch (m->state) { + + case MOUNT_MOUNTING: + /* Our mount point has not appeared in mountinfo. Something went wrong. */ + + if (f == MOUNT_SUCCESS) { + /* Either /bin/mount has an unexpected definition of success, + * or someone raced us and we lost. */ + log_unit_warning(UNIT(m), "Mount process finished, but there is no mount."); + f = MOUNT_FAILURE_PROTOCOL; + } + mount_enter_dead(m, f); + break; + + case MOUNT_MOUNTING_DONE: + mount_enter_mounted(m, f); + break; + + case MOUNT_REMOUNTING: + case MOUNT_REMOUNTING_SIGTERM: + case MOUNT_REMOUNTING_SIGKILL: + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + break; + + case MOUNT_UNMOUNTING: + + if (f == MOUNT_SUCCESS && m->from_proc_self_mountinfo) { + + /* Still a mount point? If so, let's try again. Most likely there were multiple mount points + * stacked on top of each other. We might exceed the timeout specified by the user overall, + * but we will stop as soon as any one umount times out. */ + + if (m->n_retry_umount < RETRY_UMOUNT_MAX) { + log_unit_debug(u, "Mount still present, trying again."); + m->n_retry_umount++; + mount_enter_unmounting(m); + } else { + log_unit_warning(u, "Mount still present after %u attempts to unmount, giving up.", m->n_retry_umount); + mount_enter_mounted(m, f); + } + } else + mount_enter_dead_or_mounted(m, f); + + break; + + case MOUNT_UNMOUNTING_SIGKILL: + case MOUNT_UNMOUNTING_SIGTERM: + mount_enter_dead_or_mounted(m, f); + break; + + default: + assert_not_reached("Uh, control process died at wrong time."); + } + + /* Notify clients about changed exit status */ + unit_add_to_dbus_queue(u); +} + +static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Mount *m = MOUNT(userdata); + + assert(m); + assert(m->timer_event_source == source); + + switch (m->state) { + + case MOUNT_MOUNTING: + case MOUNT_MOUNTING_DONE: + log_unit_warning(UNIT(m), "Mounting timed out. Terminating."); + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT); + break; + + case MOUNT_REMOUNTING: + log_unit_warning(UNIT(m), "Remounting timed out. Terminating remount process."); + mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); + mount_enter_signal(m, MOUNT_REMOUNTING_SIGTERM, MOUNT_SUCCESS); + break; + + case MOUNT_REMOUNTING_SIGTERM: + mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); + + if (m->kill_context.send_sigkill) { + log_unit_warning(UNIT(m), "Remounting timed out. Killing."); + mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS); + } else { + log_unit_warning(UNIT(m), "Remounting timed out. Skipping SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + } + break; + + case MOUNT_REMOUNTING_SIGKILL: + mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); + + log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + break; + + case MOUNT_UNMOUNTING: + log_unit_warning(UNIT(m), "Unmounting timed out. Terminating."); + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT); + break; + + case MOUNT_UNMOUNTING_SIGTERM: + if (m->kill_context.send_sigkill) { + log_unit_warning(UNIT(m), "Mount process timed out. Killing."); + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(m), "Mount process timed out. Skipping SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT); + } + break; + + case MOUNT_UNMOUNTING_SIGKILL: + log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +static int mount_setup_new_unit( + Manager *m, + const char *name, + const char *what, + const char *where, + const char *options, + const char *fstype, + MountProcFlags *ret_flags, + Unit **ret) { + + _cleanup_(unit_freep) Unit *u = NULL; + int r; + + assert(m); + assert(name); + assert(ret_flags); + assert(ret); + + r = unit_new_for_name(m, sizeof(Mount), name, &u); + if (r < 0) + return r; + + r = free_and_strdup(&u->source_path, "/proc/self/mountinfo"); + if (r < 0) + return r; + + r = free_and_strdup(&MOUNT(u)->where, where); + if (r < 0) + return r; + + r = update_parameters_proc_self_mount_info(MOUNT(u), what, options, fstype); + if (r < 0) + return r; + + /* This unit was generated because /proc/self/mountinfo reported it. Remember this, so that by the time we load + * the unit file for it (and thus add in extra deps right after) we know what source to attributes the deps + * to.*/ + MOUNT(u)->from_proc_self_mountinfo = true; + + /* We have only allocated the stub now, let's enqueue this unit for loading now, so that everything else is + * loaded in now. */ + unit_add_to_load_queue(u); + + *ret_flags = MOUNT_PROC_IS_MOUNTED | MOUNT_PROC_JUST_MOUNTED | MOUNT_PROC_JUST_CHANGED; + *ret = TAKE_PTR(u); + return 0; +} + +static int mount_setup_existing_unit( + Unit *u, + const char *what, + const char *where, + const char *options, + const char *fstype, + MountProcFlags *ret_flags) { + + MountProcFlags flags = MOUNT_PROC_IS_MOUNTED; + int r; + + assert(u); + assert(flags); + + if (!MOUNT(u)->where) { + MOUNT(u)->where = strdup(where); + if (!MOUNT(u)->where) + return -ENOMEM; + } + + r = update_parameters_proc_self_mount_info(MOUNT(u), what, options, fstype); + if (r < 0) + return r; + if (r > 0) + flags |= MOUNT_PROC_JUST_CHANGED; + + if (!MOUNT(u)->from_proc_self_mountinfo || FLAGS_SET(MOUNT(u)->proc_flags, MOUNT_PROC_JUST_MOUNTED)) + flags |= MOUNT_PROC_JUST_MOUNTED; + + MOUNT(u)->from_proc_self_mountinfo = true; + + if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) { + /* The unit was previously not found or otherwise not loaded. Now that the unit shows up in + * /proc/self/mountinfo we should reconsider it this, hence set it to UNIT_LOADED. */ + u->load_state = UNIT_LOADED; + u->load_error = 0; + + flags |= MOUNT_PROC_JUST_CHANGED; + } + + if (FLAGS_SET(flags, MOUNT_PROC_JUST_CHANGED)) { + /* If things changed, then make sure that all deps are regenerated. Let's + * first remove all automatic deps, and then add in the new ones. */ + + unit_remove_dependencies(u, UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT); + + r = mount_add_extras(MOUNT(u)); + if (r < 0) + return r; + } + + *ret_flags = flags; + return 0; +} + +static int mount_setup_unit( + Manager *m, + const char *what, + const char *where, + const char *options, + const char *fstype, + bool set_flags) { + + _cleanup_free_ char *e = NULL; + MountProcFlags flags; + Unit *u; + int r; + + assert(m); + assert(what); + assert(where); + assert(options); + assert(fstype); + + /* Ignore API mount points. They should never be referenced in + * dependencies ever. */ + if (mount_point_is_api(where) || mount_point_ignore(where)) + return 0; + + if (streq(fstype, "autofs")) + return 0; + + /* probably some kind of swap, ignore */ + if (!is_path(where)) + return 0; + + r = unit_name_from_path(where, ".mount", &e); + if (r < 0) + return log_error_errno(r, "Failed to generate unit name from path '%s': %m", where); + + u = manager_get_unit(m, e); + if (u) + r = mount_setup_existing_unit(u, what, where, options, fstype, &flags); + else + /* First time we see this mount point meaning that it's not been initiated by a mount unit but rather + * by the sysadmin having called mount(8) directly. */ + r = mount_setup_new_unit(m, e, what, where, options, fstype, &flags, &u); + if (r < 0) + return log_warning_errno(r, "Failed to set up mount unit: %m"); + + /* If the mount changed properties or state, let's notify our clients */ + if (flags & (MOUNT_PROC_JUST_CHANGED|MOUNT_PROC_JUST_MOUNTED)) + unit_add_to_dbus_queue(u); + + if (set_flags) + MOUNT(u)->proc_flags = flags; + + return 0; +} + +static int mount_load_proc_self_mountinfo(Manager *m, bool set_flags) { + _cleanup_(mnt_free_tablep) struct libmnt_table *t = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *i = NULL; + int r; + + assert(m); + + t = mnt_new_table(); + i = mnt_new_iter(MNT_ITER_FORWARD); + if (!t || !i) + return log_oom(); + + r = mnt_table_parse_mtab(t, NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse /proc/self/mountinfo: %m"); + + for (;;) { + struct libmnt_fs *fs; + const char *device, *path, *options, *fstype; + _cleanup_free_ char *d = NULL, *p = NULL; + int k; + + k = mnt_table_next_fs(t, i, &fs); + if (k == 1) + break; + if (k < 0) + return log_error_errno(k, "Failed to get next entry from /proc/self/mountinfo: %m"); + + device = mnt_fs_get_source(fs); + path = mnt_fs_get_target(fs); + options = mnt_fs_get_options(fs); + fstype = mnt_fs_get_fstype(fs); + + if (!device || !path) + continue; + + if (cunescape(device, UNESCAPE_RELAX, &d) < 0) + return log_oom(); + + if (cunescape(path, UNESCAPE_RELAX, &p) < 0) + return log_oom(); + + device_found_node(m, d, DEVICE_FOUND_MOUNT, DEVICE_FOUND_MOUNT); + + (void) mount_setup_unit(m, d, p, options, fstype, set_flags); + } + + return 0; +} + +static void mount_shutdown(Manager *m) { + assert(m); + + m->mount_event_source = sd_event_source_unref(m->mount_event_source); + + mnt_unref_monitor(m->mount_monitor); + m->mount_monitor = NULL; +} + +static int mount_get_timeout(Unit *u, usec_t *timeout) { + Mount *m = MOUNT(u); + usec_t t; + int r; + + if (!m->timer_event_source) + return 0; + + r = sd_event_source_get_time(m->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static void mount_enumerate_perpetual(Manager *m) { + Unit *u; + int r; + + assert(m); + + /* Whatever happens, we know for sure that the root directory is around, and cannot go away. Let's + * unconditionally synthesize it here and mark it as perpetual. */ + + u = manager_get_unit(m, SPECIAL_ROOT_MOUNT); + if (!u) { + r = unit_new_for_name(m, sizeof(Mount), SPECIAL_ROOT_MOUNT, &u); + if (r < 0) { + log_error_errno(r, "Failed to allocate the special " SPECIAL_ROOT_MOUNT " unit: %m"); + return; + } + } + + u->perpetual = true; + MOUNT(u)->deserialized_state = MOUNT_MOUNTED; + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); +} + +static bool mount_is_mounted(Mount *m) { + assert(m); + + return UNIT(m)->perpetual || FLAGS_SET(m->proc_flags, MOUNT_PROC_IS_MOUNTED); +} + +static void mount_enumerate(Manager *m) { + int r; + + assert(m); + + mnt_init_debug(0); + + if (!m->mount_monitor) { + int fd; + + m->mount_monitor = mnt_new_monitor(); + if (!m->mount_monitor) { + log_oom(); + goto fail; + } + + r = mnt_monitor_enable_kernel(m->mount_monitor, 1); + if (r < 0) { + log_error_errno(r, "Failed to enable watching of kernel mount events: %m"); + goto fail; + } + + r = mnt_monitor_enable_userspace(m->mount_monitor, 1, NULL); + if (r < 0) { + log_error_errno(r, "Failed to enable watching of userspace mount events: %m"); + goto fail; + } + + /* mnt_unref_monitor() will close the fd */ + fd = r = mnt_monitor_get_fd(m->mount_monitor); + if (r < 0) { + log_error_errno(r, "Failed to acquire watch file descriptor: %m"); + goto fail; + } + + r = sd_event_add_io(m->event, &m->mount_event_source, fd, EPOLLIN, mount_dispatch_io, m); + if (r < 0) { + log_error_errno(r, "Failed to watch mount file descriptor: %m"); + goto fail; + } + + r = sd_event_source_set_priority(m->mount_event_source, SD_EVENT_PRIORITY_NORMAL-10); + if (r < 0) { + log_error_errno(r, "Failed to adjust mount watch priority: %m"); + goto fail; + } + + (void) sd_event_source_set_description(m->mount_event_source, "mount-monitor-dispatch"); + } + + r = mount_load_proc_self_mountinfo(m, false); + if (r < 0) + goto fail; + + return; + +fail: + mount_shutdown(m); +} + +static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + _cleanup_set_free_free_ Set *around = NULL, *gone = NULL; + Manager *m = userdata; + const char *what; + Iterator i; + Unit *u; + int r; + + assert(m); + assert(revents & EPOLLIN); + + if (fd == mnt_monitor_get_fd(m->mount_monitor)) { + bool rescan = false; + + /* Drain all events and verify that the event is valid. + * + * Note that libmount also monitors /run/mount mkdir if the + * directory does not exist yet. The mkdir may generate event + * which is irrelevant for us. + * + * error: r < 0; valid: r == 0, false positive: rc == 1 */ + do { + r = mnt_monitor_next_change(m->mount_monitor, NULL, NULL); + if (r == 0) + rescan = true; + else if (r < 0) + return log_error_errno(r, "Failed to drain libmount events: %m"); + } while (r == 0); + + log_debug("libmount event [rescan: %s]", yes_no(rescan)); + if (!rescan) + return 0; + } + + r = mount_load_proc_self_mountinfo(m, true); + if (r < 0) { + /* Reset flags, just in case, for later calls */ + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT]) + MOUNT(u)->proc_flags = 0; + + return 0; + } + + manager_dispatch_load_queue(m); + + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT]) { + Mount *mount = MOUNT(u); + + if (!mount_is_mounted(mount)) { + + /* A mount point is not around right now. It + * might be gone, or might never have + * existed. */ + + if (mount->from_proc_self_mountinfo && + mount->parameters_proc_self_mountinfo.what) { + + /* Remember that this device might just have disappeared */ + if (set_ensure_allocated(&gone, &path_hash_ops) < 0 || + set_put_strdup(gone, mount->parameters_proc_self_mountinfo.what) < 0) + log_oom(); /* we don't care too much about OOM here... */ + } + + mount->from_proc_self_mountinfo = false; + assert_se(update_parameters_proc_self_mount_info(mount, NULL, NULL, NULL) >= 0); + + switch (mount->state) { + + case MOUNT_MOUNTED: + /* This has just been unmounted by somebody else, follow the state change. */ + mount_enter_dead(mount, MOUNT_SUCCESS); + break; + + default: + break; + } + + } else if (mount->proc_flags & (MOUNT_PROC_JUST_MOUNTED|MOUNT_PROC_JUST_CHANGED)) { + + /* A mount point was added or changed */ + + switch (mount->state) { + + case MOUNT_DEAD: + case MOUNT_FAILED: + + /* This has just been mounted by somebody else, follow the state change, but let's + * generate a new invocation ID for this implicitly and automatically. */ + (void) unit_acquire_invocation_id(u); + mount_cycle_clear(mount); + mount_enter_mounted(mount, MOUNT_SUCCESS); + break; + + case MOUNT_MOUNTING: + mount_set_state(mount, MOUNT_MOUNTING_DONE); + break; + + default: + /* Nothing really changed, but let's + * issue an notification call + * nonetheless, in case somebody is + * waiting for this. (e.g. file system + * ro/rw remounts.) */ + mount_set_state(mount, mount->state); + break; + } + } + + if (mount_is_mounted(mount) && + mount->from_proc_self_mountinfo && + mount->parameters_proc_self_mountinfo.what) { + /* Track devices currently used */ + + if (set_ensure_allocated(&around, &path_hash_ops) < 0 || + set_put_strdup(around, mount->parameters_proc_self_mountinfo.what) < 0) + log_oom(); + } + + /* Reset the flags for later calls */ + mount->proc_flags = 0; + } + + SET_FOREACH(what, gone, i) { + if (set_contains(around, what)) + continue; + + /* Let the device units know that the device is no longer mounted */ + device_found_node(m, what, 0, DEVICE_FOUND_MOUNT); + } + + return 0; +} + +static void mount_reset_failed(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + if (m->state == MOUNT_FAILED) + mount_set_state(m, MOUNT_DEAD); + + m->result = MOUNT_SUCCESS; + m->reload_result = MOUNT_SUCCESS; +} + +static int mount_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + Mount *m = MOUNT(u); + + assert(m); + + return unit_kill_common(u, who, signo, -1, MOUNT(u)->control_pid, error); +} + +static int mount_control_pid(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + return m->control_pid; +} + +static const char* const mount_exec_command_table[_MOUNT_EXEC_COMMAND_MAX] = { + [MOUNT_EXEC_MOUNT] = "ExecMount", + [MOUNT_EXEC_UNMOUNT] = "ExecUnmount", + [MOUNT_EXEC_REMOUNT] = "ExecRemount", +}; + +DEFINE_STRING_TABLE_LOOKUP(mount_exec_command, MountExecCommand); + +static const char* const mount_result_table[_MOUNT_RESULT_MAX] = { + [MOUNT_SUCCESS] = "success", + [MOUNT_FAILURE_RESOURCES] = "resources", + [MOUNT_FAILURE_TIMEOUT] = "timeout", + [MOUNT_FAILURE_EXIT_CODE] = "exit-code", + [MOUNT_FAILURE_SIGNAL] = "signal", + [MOUNT_FAILURE_CORE_DUMP] = "core-dump", + [MOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [MOUNT_FAILURE_PROTOCOL] = "protocol", +}; + +DEFINE_STRING_TABLE_LOOKUP(mount_result, MountResult); + +const UnitVTable mount_vtable = { + .object_size = sizeof(Mount), + .exec_context_offset = offsetof(Mount, exec_context), + .cgroup_context_offset = offsetof(Mount, cgroup_context), + .kill_context_offset = offsetof(Mount, kill_context), + .exec_runtime_offset = offsetof(Mount, exec_runtime), + .dynamic_creds_offset = offsetof(Mount, dynamic_creds), + + .sections = + "Unit\0" + "Mount\0" + "Install\0", + .private_section = "Mount", + + .init = mount_init, + .load = mount_load, + .done = mount_done, + + .coldplug = mount_coldplug, + + .dump = mount_dump, + + .start = mount_start, + .stop = mount_stop, + .reload = mount_reload, + + .kill = mount_kill, + + .serialize = mount_serialize, + .deserialize_item = mount_deserialize_item, + + .active_state = mount_active_state, + .sub_state_to_string = mount_sub_state_to_string, + + .may_gc = mount_may_gc, + + .sigchld_event = mount_sigchld_event, + + .reset_failed = mount_reset_failed, + + .control_pid = mount_control_pid, + + .bus_vtable = bus_mount_vtable, + .bus_set_property = bus_mount_set_property, + .bus_commit_properties = bus_mount_commit_properties, + + .get_timeout = mount_get_timeout, + + .can_transient = true, + + .enumerate_perpetual = mount_enumerate_perpetual, + .enumerate = mount_enumerate, + .shutdown = mount_shutdown, + + .status_message_formats = { + .starting_stopping = { + [0] = "Mounting %s...", + [1] = "Unmounting %s...", + }, + .finished_start_job = { + [JOB_DONE] = "Mounted %s.", + [JOB_FAILED] = "Failed to mount %s.", + [JOB_TIMEOUT] = "Timed out mounting %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Unmounted %s.", + [JOB_FAILED] = "Failed unmounting %s.", + [JOB_TIMEOUT] = "Timed out unmounting %s.", + }, + }, +}; diff --git a/src/core/mount.h b/src/core/mount.h new file mode 100644 index 0000000..2e59f1f --- /dev/null +++ b/src/core/mount.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct Mount Mount; + +#include "kill.h" +#include "dynamic-user.h" +#include "unit.h" + +typedef enum MountExecCommand { + MOUNT_EXEC_MOUNT, + MOUNT_EXEC_UNMOUNT, + MOUNT_EXEC_REMOUNT, + _MOUNT_EXEC_COMMAND_MAX, + _MOUNT_EXEC_COMMAND_INVALID = -1 +} MountExecCommand; + +typedef enum MountResult { + MOUNT_SUCCESS, + MOUNT_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */ + MOUNT_FAILURE_TIMEOUT, + MOUNT_FAILURE_EXIT_CODE, + MOUNT_FAILURE_SIGNAL, + MOUNT_FAILURE_CORE_DUMP, + MOUNT_FAILURE_START_LIMIT_HIT, + MOUNT_FAILURE_PROTOCOL, + _MOUNT_RESULT_MAX, + _MOUNT_RESULT_INVALID = -1 +} MountResult; + +typedef struct MountParameters { + char *what; + char *options; + char *fstype; +} MountParameters; + +/* Used while looking for mount points that vanished or got added from/to /proc/self/mountinfo */ +typedef enum MountProcFlags { + MOUNT_PROC_IS_MOUNTED = 1 << 0, + MOUNT_PROC_JUST_MOUNTED = 1 << 1, + MOUNT_PROC_JUST_CHANGED = 1 << 2, +} MountProcFlags; + +struct Mount { + Unit meta; + + char *where; + + MountParameters parameters_proc_self_mountinfo; + MountParameters parameters_fragment; + + bool from_proc_self_mountinfo:1; + bool from_fragment:1; + + MountProcFlags proc_flags; + + bool sloppy_options; + + bool lazy_unmount; + bool force_unmount; + + MountResult result; + MountResult reload_result; + + mode_t directory_mode; + + usec_t timeout_usec; + + ExecCommand exec_command[_MOUNT_EXEC_COMMAND_MAX]; + + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ExecRuntime *exec_runtime; + DynamicCreds dynamic_creds; + + MountState state, deserialized_state; + + ExecCommand* control_command; + MountExecCommand control_command_id; + pid_t control_pid; + + sd_event_source *timer_event_source; + + unsigned n_retry_umount; +}; + +extern const UnitVTable mount_vtable; + +void mount_fd_event(Manager *m, int events); + +const char* mount_exec_command_to_string(MountExecCommand i) _const_; +MountExecCommand mount_exec_command_from_string(const char *s) _pure_; + +const char* mount_result_to_string(MountResult i) _const_; +MountResult mount_result_from_string(const char *s) _pure_; + +DEFINE_CAST(MOUNT, Mount); diff --git a/src/core/namespace.c b/src/core/namespace.c new file mode 100644 index 0000000..7f553a4 --- /dev/null +++ b/src/core/namespace.c @@ -0,0 +1,1733 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <unistd.h> +#include <linux/fs.h> + +#include "alloc-util.h" +#include "base-filesystem.h" +#include "dev-setup.h" +#include "fd-util.h" +#include "fs-util.h" +#include "label.h" +#include "loop-util.h" +#include "loopback-setup.h" +#include "missing.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace.h" +#include "path-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "umask-util.h" +#include "user-util.h" +#include "util.h" + +#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC) + +typedef enum MountMode { + /* This is ordered by priority! */ + INACCESSIBLE, + BIND_MOUNT, + BIND_MOUNT_RECURSIVE, + PRIVATE_TMP, + PRIVATE_DEV, + BIND_DEV, + EMPTY_DIR, + SYSFS, + PROCFS, + READONLY, + READWRITE, + TMPFS, +} MountMode; + +typedef struct MountEntry { + const char *path_const; /* Memory allocated on stack or static */ + MountMode mode:5; + bool ignore:1; /* Ignore if path does not exist? */ + bool has_prefix:1; /* Already is prefixed by the root dir? */ + bool read_only:1; /* Shall this mount point be read-only? */ + bool applied:1; /* Already applied */ + char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */ + const char *source_const; /* The source path, for bind mounts */ + char *source_malloc; + const char *options_const;/* Mount options for tmpfs */ + char *options_malloc; + unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */ + unsigned n_followed; +} MountEntry; + +/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted + * something there already. These mounts are hence overridden by any other explicitly configured mounts. */ +static const MountEntry apivfs_table[] = { + { "/proc", PROCFS, false }, + { "/dev", BIND_DEV, false }, + { "/sys", SYSFS, false }, +}; + +/* ProtectKernelTunables= option and the related filesystem APIs */ +static const MountEntry protect_kernel_tunables_table[] = { + { "/proc/acpi", READONLY, true }, + { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */ + { "/proc/asound", READONLY, true }, + { "/proc/bus", READONLY, true }, + { "/proc/fs", READONLY, true }, + { "/proc/irq", READONLY, true }, + { "/proc/kallsyms", INACCESSIBLE, true }, + { "/proc/kcore", INACCESSIBLE, true }, + { "/proc/latency_stats", READONLY, true }, + { "/proc/mtrr", READONLY, true }, + { "/proc/scsi", READONLY, true }, + { "/proc/sys", READONLY, false }, + { "/proc/sysrq-trigger", READONLY, true }, + { "/proc/timer_stats", READONLY, true }, + { "/sys", READONLY, false }, + { "/sys/fs/bpf", READONLY, true }, + { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */ + { "/sys/fs/selinux", READWRITE, true }, + { "/sys/kernel/debug", READONLY, true }, + { "/sys/kernel/tracing", READONLY, true }, +}; + +/* ProtectKernelModules= option */ +static const MountEntry protect_kernel_modules_table[] = { +#if HAVE_SPLIT_USR + { "/lib/modules", INACCESSIBLE, true }, +#endif + { "/usr/lib/modules", INACCESSIBLE, true }, +}; + +/* + * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of + * system should be protected by ProtectSystem= + */ +static const MountEntry protect_home_read_only_table[] = { + { "/home", READONLY, true }, + { "/run/user", READONLY, true }, + { "/root", READONLY, true }, +}; + +/* ProtectHome=tmpfs table */ +static const MountEntry protect_home_tmpfs_table[] = { + { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME }, + { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME }, + { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME }, +}; + +/* ProtectHome=yes table */ +static const MountEntry protect_home_yes_table[] = { + { "/home", INACCESSIBLE, true }, + { "/run/user", INACCESSIBLE, true }, + { "/root", INACCESSIBLE, true }, +}; + +/* ProtectSystem=yes table */ +static const MountEntry protect_system_yes_table[] = { + { "/usr", READONLY, false }, + { "/boot", READONLY, true }, + { "/efi", READONLY, true }, +#if HAVE_SPLIT_USR + { "/lib", READONLY, true }, + { "/lib64", READONLY, true }, + { "/bin", READONLY, true }, +# if HAVE_SPLIT_BIN + { "/sbin", READONLY, true }, +# endif +#endif +}; + +/* ProtectSystem=full includes ProtectSystem=yes */ +static const MountEntry protect_system_full_table[] = { + { "/usr", READONLY, false }, + { "/boot", READONLY, true }, + { "/efi", READONLY, true }, + { "/etc", READONLY, false }, +#if HAVE_SPLIT_USR + { "/lib", READONLY, true }, + { "/lib64", READONLY, true }, + { "/bin", READONLY, true }, +# if HAVE_SPLIT_BIN + { "/sbin", READONLY, true }, +# endif +#endif +}; + +/* + * ProtectSystem=strict table. In this strict mode, we mount everything + * read-only, except for /proc, /dev, /sys which are the kernel API VFS, + * which are left writable, but PrivateDevices= + ProtectKernelTunables= + * protect those, and these options should be fully orthogonal. + * (And of course /home and friends are also left writable, as ProtectHome= + * shall manage those, orthogonally). + */ +static const MountEntry protect_system_strict_table[] = { + { "/", READONLY, false }, + { "/proc", READWRITE, false }, /* ProtectKernelTunables= */ + { "/sys", READWRITE, false }, /* ProtectKernelTunables= */ + { "/dev", READWRITE, false }, /* PrivateDevices= */ + { "/home", READWRITE, true }, /* ProtectHome= */ + { "/run/user", READWRITE, true }, /* ProtectHome= */ + { "/root", READWRITE, true }, /* ProtectHome= */ +}; + +static const char *mount_entry_path(const MountEntry *p) { + assert(p); + + /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that, + * otherwise the stack/static ->path field is returned. */ + + return p->path_malloc ?: p->path_const; +} + +static bool mount_entry_read_only(const MountEntry *p) { + assert(p); + + return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE); +} + +static const char *mount_entry_source(const MountEntry *p) { + assert(p); + + return p->source_malloc ?: p->source_const; +} + +static const char *mount_entry_options(const MountEntry *p) { + assert(p); + + return p->options_malloc ?: p->options_const; +} + +static void mount_entry_done(MountEntry *p) { + assert(p); + + p->path_malloc = mfree(p->path_malloc); + p->source_malloc = mfree(p->source_malloc); + p->options_malloc = mfree(p->options_malloc); +} + +static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) { + char **i; + + assert(p); + + /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */ + + STRV_FOREACH(i, strv) { + bool ignore = false, needs_prefix = false; + const char *e = *i; + + /* Look for any prefixes */ + if (startswith(e, "-")) { + e++; + ignore = true; + } + if (startswith(e, "+")) { + e++; + needs_prefix = true; + } + + if (!path_is_absolute(e)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Path is not absolute: %s", e); + + *((*p)++) = (MountEntry) { + .path_const = e, + .mode = mode, + .ignore = ignore, + .has_prefix = !needs_prefix && !forcibly_require_prefix, + }; + } + + return 0; +} + +static int append_empty_dir_mounts(MountEntry **p, char **strv) { + char **i; + + assert(p); + + /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the + * "/private/" boundary directories for DynamicUser=1. */ + + STRV_FOREACH(i, strv) { + + *((*p)++) = (MountEntry) { + .path_const = *i, + .mode = EMPTY_DIR, + .ignore = false, + .read_only = true, + .options_const = "mode=755", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, + }; + } + + return 0; +} + +static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) { + size_t i; + + assert(p); + + for (i = 0; i < n; i++) { + const BindMount *b = binds + i; + + *((*p)++) = (MountEntry) { + .path_const = b->destination, + .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT, + .read_only = b->read_only, + .source_const = b->source, + .ignore = b->ignore_enoent, + }; + } + + return 0; +} + +static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) { + size_t i; + int r; + + assert(p); + + for (i = 0; i < n; i++) { + const TemporaryFileSystem *t = tmpfs + i; + _cleanup_free_ char *o = NULL, *str = NULL; + unsigned long flags; + bool ro = false; + + if (!path_is_absolute(t->path)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Path is not absolute: %s", + t->path); + + str = strjoin("mode=0755,", t->options); + if (!str) + return -ENOMEM; + + r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o); + if (r < 0) + return log_debug_errno(r, "Failed to parse mount option '%s': %m", str); + + ro = flags & MS_RDONLY; + if (ro) + flags ^= MS_RDONLY; + + *((*p)++) = (MountEntry) { + .path_const = t->path, + .mode = TMPFS, + .read_only = ro, + .options_malloc = TAKE_PTR(o), + .flags = flags, + }; + } + + return 0; +} + +static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) { + size_t i; + + assert(p); + assert(mounts); + + /* Adds a list of static pre-defined entries */ + + for (i = 0; i < n; i++) + *((*p)++) = (MountEntry) { + .path_const = mount_entry_path(mounts+i), + .mode = mounts[i].mode, + .ignore = mounts[i].ignore || ignore_protect, + }; + + return 0; +} + +static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) { + assert(p); + + switch (protect_home) { + + case PROTECT_HOME_NO: + return 0; + + case PROTECT_HOME_READ_ONLY: + return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect); + + case PROTECT_HOME_TMPFS: + return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect); + + case PROTECT_HOME_YES: + return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect); + + default: + assert_not_reached("Unexpected ProtectHome= value"); + } +} + +static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) { + assert(p); + + switch (protect_system) { + + case PROTECT_SYSTEM_NO: + return 0; + + case PROTECT_SYSTEM_STRICT: + return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect); + + case PROTECT_SYSTEM_YES: + return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect); + + case PROTECT_SYSTEM_FULL: + return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect); + + default: + assert_not_reached("Unexpected ProtectSystem= value"); + } +} + +static int mount_path_compare(const MountEntry *a, const MountEntry *b) { + int d; + + /* If the paths are not equal, then order prefixes first */ + d = path_compare(mount_entry_path(a), mount_entry_path(b)); + if (d != 0) + return d; + + /* If the paths are equal, check the mode */ + return CMP((int) a->mode, (int) b->mode); +} + +static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) { + size_t i; + + /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */ + + for (i = 0; i < n; i++) { + char *s; + + if (m[i].has_prefix) + continue; + + s = prefix_root(root_directory, mount_entry_path(m+i)); + if (!s) + return -ENOMEM; + + free_and_replace(m[i].path_malloc, s); + m[i].has_prefix = true; + } + + return 0; +} + +static void drop_duplicates(MountEntry *m, size_t *n) { + MountEntry *f, *t, *previous; + + assert(m); + assert(n); + + /* Drops duplicate entries. Expects that the array is properly ordered already. */ + + for (f = m, t = m, previous = NULL; f < m + *n; f++) { + + /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare() + * above. Note that we only drop duplicates that haven't been mounted yet. */ + if (previous && + path_equal(mount_entry_path(f), mount_entry_path(previous)) && + !f->applied && !previous->applied) { + log_debug("%s is duplicate.", mount_entry_path(f)); + previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */ + mount_entry_done(f); + continue; + } + + *t = *f; + previous = t; + t++; + } + + *n = t - m; +} + +static void drop_inaccessible(MountEntry *m, size_t *n) { + MountEntry *f, *t; + const char *clear = NULL; + + assert(m); + assert(n); + + /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly + * ordered already. */ + + for (f = m, t = m; f < m + *n; f++) { + + /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop + * it, as inaccessible paths really should drop the entire subtree. */ + if (clear && path_startswith(mount_entry_path(f), clear)) { + log_debug("%s is masked by %s.", mount_entry_path(f), clear); + mount_entry_done(f); + continue; + } + + clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL; + + *t = *f; + t++; + } + + *n = t - m; +} + +static void drop_nop(MountEntry *m, size_t *n) { + MountEntry *f, *t; + + assert(m); + assert(n); + + /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the + * list is ordered by prefixes. */ + + for (f = m, t = m; f < m + *n; f++) { + + /* Only suppress such subtrees for READONLY and READWRITE entries */ + if (IN_SET(f->mode, READONLY, READWRITE)) { + MountEntry *p; + bool found = false; + + /* Now let's find the first parent of the entry we are looking at. */ + for (p = t-1; p >= m; p--) { + if (path_startswith(mount_entry_path(f), mount_entry_path(p))) { + found = true; + break; + } + } + + /* We found it, let's see if it's the same mode, if so, we can drop this entry */ + if (found && p->mode == f->mode) { + log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p)); + mount_entry_done(f); + continue; + } + } + + *t = *f; + t++; + } + + *n = t - m; +} + +static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) { + MountEntry *f, *t; + + assert(m); + assert(n); + + /* Nothing to do */ + if (!root_directory) + return; + + /* Drops all mounts that are outside of the root directory. */ + + for (f = m, t = m; f < m + *n; f++) { + + if (!path_startswith(mount_entry_path(f), root_directory)) { + log_debug("%s is outside of root directory.", mount_entry_path(f)); + mount_entry_done(f); + continue; + } + + *t = *f; + t++; + } + + *n = t - m; +} + +static int clone_device_node( + const char *d, + const char *temporary_mount, + bool *make_devnode) { + + _cleanup_free_ char *sl = NULL; + const char *dn, *bn, *t; + struct stat st; + int r; + + if (stat(d, &st) < 0) { + if (errno == ENOENT) { + log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d); + return -ENXIO; + } + + return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d); + } + + if (!S_ISBLK(st.st_mode) && + !S_ISCHR(st.st_mode)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Device node '%s' to clone is not a device node, ignoring.", + d); + + dn = strjoina(temporary_mount, d); + + /* First, try to create device node properly */ + if (*make_devnode) { + mac_selinux_create_file_prepare(d, st.st_mode); + r = mknod(dn, st.st_mode, st.st_rdev); + mac_selinux_create_file_clear(); + if (r >= 0) + goto add_symlink; + if (errno != EPERM) + return log_debug_errno(errno, "mknod failed for %s: %m", d); + + /* This didn't work, let's not try this again for the next iterations. */ + *make_devnode = false; + } + + /* We're about to fallback to bind-mounting the device + * node. So create a dummy bind-mount target. */ + mac_selinux_create_file_prepare(d, 0); + r = mknod(dn, S_IFREG, 0); + mac_selinux_create_file_clear(); + if (r < 0 && errno != EEXIST) + return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d); + + /* Fallback to bind-mounting: + * The assumption here is that all used device nodes carry standard + * properties. Specifically, the devices nodes we bind-mount should + * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx) + * and should not carry ACLs. */ + if (mount(d, dn, NULL, MS_BIND, NULL) < 0) + return log_debug_errno(errno, "Bind mounting failed for '%s': %m", d); + +add_symlink: + bn = path_startswith(d, "/dev/"); + if (!bn) + return 0; + + /* Create symlinks like /dev/char/1:9 → ../urandom */ + if (asprintf(&sl, "%s/dev/%s/%u:%u", temporary_mount, S_ISCHR(st.st_mode) ? "char" : "block", major(st.st_rdev), minor(st.st_rdev)) < 0) + return log_oom(); + + (void) mkdir_parents(sl, 0755); + + t = strjoina("../", bn); + + if (symlink(t, sl) < 0) + log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl); + + return 0; +} + +static int mount_private_dev(MountEntry *m) { + static const char devnodes[] = + "/dev/null\0" + "/dev/zero\0" + "/dev/full\0" + "/dev/random\0" + "/dev/urandom\0" + "/dev/tty\0"; + + char temporary_mount[] = "/tmp/namespace-dev-XXXXXX"; + const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL; + bool can_mknod = true; + _cleanup_umask_ mode_t u; + int r; + + assert(m); + + u = umask(0000); + + if (!mkdtemp(temporary_mount)) + return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount); + + dev = strjoina(temporary_mount, "/dev"); + (void) mkdir(dev, 0755); + if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) { + r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev); + goto fail; + } + + devpts = strjoina(temporary_mount, "/dev/pts"); + (void) mkdir(devpts, 0755); + if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) { + r = log_debug_errno(errno, "Failed to bind mount /dev/pts on '%s': %m", devpts); + goto fail; + } + + /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx. + * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible. + * Thus, in that case make a clone. + * In nspawn and other containers it will be a symlink, in that case make it a symlink. */ + r = is_symlink("/dev/ptmx"); + if (r < 0) { + log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m"); + goto fail; + } else if (r > 0) { + devptmx = strjoina(temporary_mount, "/dev/ptmx"); + if (symlink("pts/ptmx", devptmx) < 0) { + r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx); + goto fail; + } + } else { + r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod); + if (r < 0) + goto fail; + } + + devshm = strjoina(temporary_mount, "/dev/shm"); + (void) mkdir(devshm, 0755); + r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL); + if (r < 0) { + r = log_debug_errno(errno, "Failed to bind mount /dev/shm on '%s': %m", devshm); + goto fail; + } + + devmqueue = strjoina(temporary_mount, "/dev/mqueue"); + (void) mkdir(devmqueue, 0755); + if (mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL) < 0) + log_debug_errno(errno, "Failed to bind mount /dev/mqueue on '%s', ignoring: %m", devmqueue); + + devhugepages = strjoina(temporary_mount, "/dev/hugepages"); + (void) mkdir(devhugepages, 0755); + if (mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL) < 0) + log_debug_errno(errno, "Failed to bind mount /dev/hugepages on '%s', ignoring: %m", devhugepages); + + devlog = strjoina(temporary_mount, "/dev/log"); + if (symlink("/run/systemd/journal/dev-log", devlog) < 0) + log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog); + + NULSTR_FOREACH(d, devnodes) { + r = clone_device_node(d, temporary_mount, &can_mknod); + /* ENXIO means the the *source* is not a device file, skip creation in that case */ + if (r < 0 && r != -ENXIO) + goto fail; + } + + r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID); + if (r < 0) + log_debug_errno(r, "Failed to setup basic device tree at '%s', ignoring: %m", temporary_mount); + + /* Create the /dev directory if missing. It is more likely to be + * missing when the service is started with RootDirectory. This is + * consistent with mount units creating the mount points when missing. + */ + (void) mkdir_p_label(mount_entry_path(m), 0755); + + /* Unmount everything in old /dev */ + r = umount_recursive(mount_entry_path(m), 0); + if (r < 0) + log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m)); + + if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) { + r = log_debug_errno(errno, "Failed to move mount point '%s' to '%s': %m", dev, mount_entry_path(m)); + goto fail; + } + + rmdir(dev); + rmdir(temporary_mount); + + return 0; + +fail: + if (devpts) + umount(devpts); + + if (devshm) + umount(devshm); + + if (devhugepages) + umount(devhugepages); + + if (devmqueue) + umount(devmqueue); + + umount(dev); + rmdir(dev); + rmdir(temporary_mount); + + return r; +} + +static int mount_bind_dev(const MountEntry *m) { + int r; + + assert(m); + + /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's + * /dev. This is only used when RootDirectory= is set. */ + + (void) mkdir_p_label(mount_entry_path(m), 0755); + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0) + return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m"); + if (r > 0) /* make this a NOP if /dev is already a mount point */ + return 0; + + if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m)); + + return 1; +} + +static int mount_sysfs(const MountEntry *m) { + int r; + + assert(m); + + (void) mkdir_p_label(mount_entry_path(m), 0755); + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0) + return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m"); + if (r > 0) /* make this a NOP if /sys is already a mount point */ + return 0; + + /* Bind mount the host's version so that we get all child mounts of it, too. */ + if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + + return 1; +} + +static int mount_procfs(const MountEntry *m) { + int r; + + assert(m); + + (void) mkdir_p_label(mount_entry_path(m), 0755); + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0) + return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m"); + if (r > 0) /* make this a NOP if /proc is already a mount point */ + return 0; + + /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */ + if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0) + return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + + return 1; +} + +static int mount_tmpfs(const MountEntry *m) { + assert(m); + + /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */ + + (void) mkdir_p_label(mount_entry_path(m), 0755); + (void) umount_recursive(mount_entry_path(m), 0); + + if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0) + return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + + return 1; +} + +static int follow_symlink( + const char *root_directory, + MountEntry *m) { + + _cleanup_free_ char *target = NULL; + int r; + + /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we + * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at + * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the + * end and already have a fully normalized name. */ + + r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target); + if (r < 0) + return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m)); + if (r > 0) /* Reached the end, nothing more to resolve */ + return 1; + + if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */ + return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), + "Symlink loop on '%s'.", + mount_entry_path(m)); + + log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target); + + free_and_replace(m->path_malloc, target); + m->has_prefix = true; + + m->n_followed ++; + + return 0; +} + +static int apply_mount( + const char *root_directory, + MountEntry *m) { + + bool rbind = true, make = false; + const char *what; + int r; + + assert(m); + + log_debug("Applying namespace mount on %s", mount_entry_path(m)); + + switch (m->mode) { + + case INACCESSIBLE: { + struct stat target; + + /* First, get rid of everything that is below if there + * is anything... Then, overmount it with an + * inaccessible path. */ + (void) umount_recursive(mount_entry_path(m), 0); + + if (lstat(mount_entry_path(m), &target) < 0) { + if (errno == ENOENT && m->ignore) + return 0; + + return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m)); + } + + what = mode_to_inaccessible_node(target.st_mode); + if (!what) + return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), + "File type not supported for inaccessible mounts. Note that symlinks are not allowed"); + break; + } + + case READONLY: + case READWRITE: + r = path_is_mount_point(mount_entry_path(m), root_directory, 0); + if (r == -ENOENT && m->ignore) + return 0; + if (r < 0) + return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m)); + if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */ + return 0; + /* This isn't a mount point yet, let's make it one. */ + what = mount_entry_path(m); + break; + + case BIND_MOUNT: + rbind = false; + + _fallthrough_; + case BIND_MOUNT_RECURSIVE: { + _cleanup_free_ char *chased = NULL; + + /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind + * mount source paths are always relative to the host root, hence we pass NULL as root directory to + * chase_symlinks() here. */ + + r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased); + if (r == -ENOENT && m->ignore) { + log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m)); + return 0; + } + if (r < 0) + return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m)); + + log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased); + + free_and_replace(m->source_malloc, chased); + + what = mount_entry_source(m); + make = true; + break; + } + + case EMPTY_DIR: + case TMPFS: + return mount_tmpfs(m); + + case PRIVATE_TMP: + what = mount_entry_source(m); + make = true; + break; + + case PRIVATE_DEV: + return mount_private_dev(m); + + case BIND_DEV: + return mount_bind_dev(m); + + case SYSFS: + return mount_sysfs(m); + + case PROCFS: + return mount_procfs(m); + + default: + assert_not_reached("Unknown mode"); + } + + assert(what); + + if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) { + bool try_again = false; + r = -errno; + + if (r == -ENOENT && make) { + struct stat st; + + /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */ + + if (stat(what, &st) < 0) + log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what); + else { + int q; + + (void) mkdir_parents(mount_entry_path(m), 0755); + + if (S_ISDIR(st.st_mode)) + q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0; + else + q = touch(mount_entry_path(m)); + + if (q < 0) + log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m)); + else + try_again = true; + } + } + + if (try_again) { + if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) + r = -errno; + else + r = 0; + } + + if (r < 0) + return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m)); + } + + log_debug("Successfully mounted %s to %s", what, mount_entry_path(m)); + return 0; +} + +/* Change the per-mount readonly flag on an existing mount */ +static int remount_bind_readonly(const char *path, unsigned long orig_flags) { + int r; + + r = mount(NULL, path, NULL, MS_REMOUNT | MS_BIND | MS_RDONLY | orig_flags, NULL); + + return r < 0 ? -errno : 0; +} + +static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) { + bool submounts = false; + int r = 0; + + assert(m); + assert(proc_self_mountinfo); + + if (mount_entry_read_only(m)) { + if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) { + r = remount_bind_readonly(mount_entry_path(m), m->flags); + } else { + submounts = true; + r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo); + } + } else if (m->mode == PRIVATE_DEV) { + /* Set /dev readonly, but not submounts like /dev/shm. Also, we only set the per-mount read-only flag. + * We can't set it on the superblock, if we are inside a user namespace and running Linux <= 4.17. */ + r = remount_bind_readonly(mount_entry_path(m), DEV_MOUNT_OPTIONS); + } else + return 0; + + /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only + * already stays this way. This improves compatibility with container managers, where we won't attempt to undo + * read-only mounts already applied. */ + + if (r == -ENOENT && m->ignore) + r = 0; + + if (r < 0) + return log_debug_errno(r, "Failed to re-mount '%s'%s read-only: %m", mount_entry_path(m), + submounts ? " and its submounts" : ""); + + return 0; +} + +static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) { + assert(ns_info); + + /* + * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, + * since to protect the API VFS mounts, they need to be around in the + * first place... + */ + + return ns_info->mount_apivfs || + ns_info->protect_control_groups || + ns_info->protect_kernel_tunables; +} + +static size_t namespace_calculate_mounts( + const NamespaceInfo *ns_info, + char** read_write_paths, + char** read_only_paths, + char** inaccessible_paths, + char** empty_directories, + size_t n_bind_mounts, + size_t n_temporary_filesystems, + const char* tmp_dir, + const char* var_tmp_dir, + ProtectHome protect_home, + ProtectSystem protect_system) { + + size_t protect_home_cnt; + size_t protect_system_cnt = + (protect_system == PROTECT_SYSTEM_STRICT ? + ELEMENTSOF(protect_system_strict_table) : + ((protect_system == PROTECT_SYSTEM_FULL) ? + ELEMENTSOF(protect_system_full_table) : + ((protect_system == PROTECT_SYSTEM_YES) ? + ELEMENTSOF(protect_system_yes_table) : 0))); + + protect_home_cnt = + (protect_home == PROTECT_HOME_YES ? + ELEMENTSOF(protect_home_yes_table) : + ((protect_home == PROTECT_HOME_READ_ONLY) ? + ELEMENTSOF(protect_home_read_only_table) : + ((protect_home == PROTECT_HOME_TMPFS) ? + ELEMENTSOF(protect_home_tmpfs_table) : 0))); + + return !!tmp_dir + !!var_tmp_dir + + strv_length(read_write_paths) + + strv_length(read_only_paths) + + strv_length(inaccessible_paths) + + strv_length(empty_directories) + + n_bind_mounts + + n_temporary_filesystems + + ns_info->private_dev + + (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + + (ns_info->protect_control_groups ? 1 : 0) + + (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + + protect_home_cnt + protect_system_cnt + + (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0); +} + +static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) { + assert(root_directory); + assert(n_mounts); + assert(mounts || *n_mounts == 0); + + typesafe_qsort(mounts, *n_mounts, mount_path_compare); + + drop_duplicates(mounts, n_mounts); + drop_outside_root(root_directory, mounts, n_mounts); + drop_inaccessible(mounts, n_mounts); + drop_nop(mounts, n_mounts); +} + +int setup_namespace( + const char* root_directory, + const char* root_image, + const NamespaceInfo *ns_info, + char** read_write_paths, + char** read_only_paths, + char** inaccessible_paths, + char** empty_directories, + const BindMount *bind_mounts, + size_t n_bind_mounts, + const TemporaryFileSystem *temporary_filesystems, + size_t n_temporary_filesystems, + const char* tmp_dir, + const char* var_tmp_dir, + ProtectHome protect_home, + ProtectSystem protect_system, + unsigned long mount_flags, + DissectImageFlags dissect_image_flags) { + + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_free_ void *root_hash = NULL; + MountEntry *m, *mounts = NULL; + size_t n_mounts, root_hash_size = 0; + bool require_prefix = false; + const char *root; + int r = 0; + + assert(ns_info); + + if (mount_flags == 0) + mount_flags = MS_SHARED; + + if (root_image) { + dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT; + + if (protect_system == PROTECT_SYSTEM_STRICT && + protect_home != PROTECT_HOME_NO && + strv_isempty(read_write_paths)) + dissect_image_flags |= DISSECT_IMAGE_READ_ONLY; + + r = loop_device_make_by_path(root_image, + dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR, + &loop_device); + if (r < 0) + return log_debug_errno(r, "Failed to create loop device for root image: %m"); + + r = root_hash_load(root_image, &root_hash, &root_hash_size); + if (r < 0) + return log_debug_errno(r, "Failed to load root hash: %m"); + + r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image); + if (r < 0) + return log_debug_errno(r, "Failed to dissect image: %m"); + + r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image); + if (r < 0) + return log_debug_errno(r, "Failed to decrypt dissected image: %m"); + } + + if (root_directory) + root = root_directory; + else { + /* Always create the mount namespace in a temporary directory, instead of operating + * directly in the root. The temporary directory prevents any mounts from being + * potentially obscured my other mounts we already applied. + * We use the same mount point for all images, which is safe, since they all live + * in their own namespaces after all, and hence won't see each other. */ + + root = "/run/systemd/unit-root"; + (void) mkdir_label(root, 0700); + require_prefix = true; + } + + n_mounts = namespace_calculate_mounts( + ns_info, + read_write_paths, + read_only_paths, + inaccessible_paths, + empty_directories, + n_bind_mounts, + n_temporary_filesystems, + tmp_dir, var_tmp_dir, + protect_home, protect_system); + + if (n_mounts > 0) { + m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry)); + r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix); + if (r < 0) + goto finish; + + r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix); + if (r < 0) + goto finish; + + r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix); + if (r < 0) + goto finish; + + r = append_empty_dir_mounts(&m, empty_directories); + if (r < 0) + goto finish; + + r = append_bind_mounts(&m, bind_mounts, n_bind_mounts); + if (r < 0) + goto finish; + + r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems); + if (r < 0) + goto finish; + + if (tmp_dir) { + *(m++) = (MountEntry) { + .path_const = "/tmp", + .mode = PRIVATE_TMP, + .source_const = tmp_dir, + }; + } + + if (var_tmp_dir) { + *(m++) = (MountEntry) { + .path_const = "/var/tmp", + .mode = PRIVATE_TMP, + .source_const = var_tmp_dir, + }; + } + + if (ns_info->private_dev) { + *(m++) = (MountEntry) { + .path_const = "/dev", + .mode = PRIVATE_DEV, + }; + } + + if (ns_info->protect_kernel_tunables) { + r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + } + + if (ns_info->protect_kernel_modules) { + r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + } + + if (ns_info->protect_control_groups) { + *(m++) = (MountEntry) { + .path_const = "/sys/fs/cgroup", + .mode = READONLY, + }; + } + + r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + + r = append_protect_system(&m, protect_system, false); + if (r < 0) + goto finish; + + if (namespace_info_mount_apivfs(ns_info)) { + r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + } + + assert(mounts + n_mounts == m); + + /* Prepend the root directory where that's necessary */ + r = prefix_where_needed(mounts, n_mounts, root); + if (r < 0) + goto finish; + + normalize_mounts(root, mounts, &n_mounts); + } + + /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */ + + if (unshare(CLONE_NEWNS) < 0) { + r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m"); + if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS)) + /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter in place + * that doesn't allow us to create namespaces (or a missing cap), then propagate a recognizable + * error back, which the caller can use to detect this case (and only this) and optionally + * continue without namespacing applied. */ + r = -ENOANO; + + goto finish; + } + + /* Remount / as SLAVE so that nothing now mounted in the namespace + * shows up in the parent */ + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { + r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m"); + goto finish; + } + + if (root_image) { + /* A root image is specified, mount it to the right place */ + r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags); + if (r < 0) { + log_debug_errno(r, "Failed to mount root image: %m"); + goto finish; + } + + if (decrypted_image) { + r = decrypted_image_relinquish(decrypted_image); + if (r < 0) { + log_debug_errno(r, "Failed to relinquish decrypted image: %m"); + goto finish; + } + } + + loop_device_relinquish(loop_device); + + } else if (root_directory) { + + /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */ + r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW); + if (r < 0) { + log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root); + goto finish; + } + if (r == 0) { + if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) { + r = log_debug_errno(errno, "Failed to bind mount '%s': %m", root); + goto finish; + } + } + + } else { + + /* Let's mount the main root directory to the root directory to use */ + if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) { + r = log_debug_errno(errno, "Failed to bind mount '/' on '%s': %m", root); + goto finish; + } + } + + /* Try to set up the new root directory before mounting anything else there. */ + if (root_image || root_directory) + (void) base_filesystem_create(root, UID_INVALID, GID_INVALID); + + if (n_mounts > 0) { + _cleanup_fclose_ FILE *proc_self_mountinfo = NULL; + char **blacklist; + size_t j; + + /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc. + * For example, this is the case with the option: 'InaccessiblePaths=/proc' */ + proc_self_mountinfo = fopen("/proc/self/mountinfo", "re"); + if (!proc_self_mountinfo) { + r = log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m"); + goto finish; + } + + /* First round, establish all mounts we need */ + for (;;) { + bool again = false; + + for (m = mounts; m < mounts + n_mounts; ++m) { + + if (m->applied) + continue; + + r = follow_symlink(root, m); + if (r < 0) + goto finish; + if (r == 0) { + /* We hit a symlinked mount point. The entry got rewritten and might point to a + * very different place now. Let's normalize the changed list, and start from + * the beginning. After all to mount the entry at the new location we might + * need some other mounts first */ + again = true; + break; + } + + r = apply_mount(root, m); + if (r < 0) + goto finish; + + m->applied = true; + } + + if (!again) + break; + + normalize_mounts(root, mounts, &n_mounts); + } + + /* Create a blacklist we can pass to bind_mount_recursive() */ + blacklist = newa(char*, n_mounts+1); + for (j = 0; j < n_mounts; j++) + blacklist[j] = (char*) mount_entry_path(mounts+j); + blacklist[j] = NULL; + + /* Second round, flip the ro bits if necessary. */ + for (m = mounts; m < mounts + n_mounts; ++m) { + r = make_read_only(m, blacklist, proc_self_mountinfo); + if (r < 0) + goto finish; + } + } + + /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */ + r = mount_move_root(root); + if (r < 0) { + log_debug_errno(r, "Failed to mount root with MS_MOVE: %m"); + goto finish; + } + + /* Remount / as the desired mode. Note that this will not + * reestablish propagation from our side to the host, since + * what's disconnected is disconnected. */ + if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) { + r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m"); + goto finish; + } + + r = 0; + +finish: + for (m = mounts; m < mounts + n_mounts; m++) + mount_entry_done(m); + + return r; +} + +void bind_mount_free_many(BindMount *b, size_t n) { + size_t i; + + assert(b || n == 0); + + for (i = 0; i < n; i++) { + free(b[i].source); + free(b[i].destination); + } + + free(b); +} + +int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) { + _cleanup_free_ char *s = NULL, *d = NULL; + BindMount *c; + + assert(b); + assert(n); + assert(item); + + s = strdup(item->source); + if (!s) + return -ENOMEM; + + d = strdup(item->destination); + if (!d) + return -ENOMEM; + + c = reallocarray(*b, *n + 1, sizeof(BindMount)); + if (!c) + return -ENOMEM; + + *b = c; + + c[(*n) ++] = (BindMount) { + .source = TAKE_PTR(s), + .destination = TAKE_PTR(d), + .read_only = item->read_only, + .recursive = item->recursive, + .ignore_enoent = item->ignore_enoent, + }; + + return 0; +} + +void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) { + size_t i; + + assert(t || n == 0); + + for (i = 0; i < n; i++) { + free(t[i].path); + free(t[i].options); + } + + free(t); +} + +int temporary_filesystem_add( + TemporaryFileSystem **t, + size_t *n, + const char *path, + const char *options) { + + _cleanup_free_ char *p = NULL, *o = NULL; + TemporaryFileSystem *c; + + assert(t); + assert(n); + assert(path); + + p = strdup(path); + if (!p) + return -ENOMEM; + + if (!isempty(options)) { + o = strdup(options); + if (!o) + return -ENOMEM; + } + + c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem)); + if (!c) + return -ENOMEM; + + *t = c; + + c[(*n) ++] = (TemporaryFileSystem) { + .path = TAKE_PTR(p), + .options = TAKE_PTR(o), + }; + + return 0; +} + +static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) { + _cleanup_free_ char *x = NULL; + char bid[SD_ID128_STRING_MAX]; + sd_id128_t boot_id; + int r; + + assert(id); + assert(prefix); + assert(path); + + /* We include the boot id in the directory so that after a + * reboot we can easily identify obsolete directories. */ + + r = sd_id128_get_boot(&boot_id); + if (r < 0) + return r; + + x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX"); + if (!x) + return -ENOMEM; + + RUN_WITH_UMASK(0077) + if (!mkdtemp(x)) + return -errno; + + RUN_WITH_UMASK(0000) { + char *y; + + y = strjoina(x, "/tmp"); + + if (mkdir(y, 0777 | S_ISVTX) < 0) + return -errno; + } + + *path = TAKE_PTR(x); + + return 0; +} + +int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) { + char *a, *b; + int r; + + assert(id); + assert(tmp_dir); + assert(var_tmp_dir); + + r = setup_one_tmp_dir(id, "/tmp", &a); + if (r < 0) + return r; + + r = setup_one_tmp_dir(id, "/var/tmp", &b); + if (r < 0) { + char *t; + + t = strjoina(a, "/tmp"); + rmdir(t); + rmdir(a); + + free(a); + return r; + } + + *tmp_dir = a; + *var_tmp_dir = b; + + return 0; +} + +int setup_netns(int netns_storage_socket[static 2]) { + _cleanup_close_ int netns = -1; + int r, q; + + assert(netns_storage_socket); + assert(netns_storage_socket[0] >= 0); + assert(netns_storage_socket[1] >= 0); + + /* We use the passed socketpair as a storage buffer for our + * namespace reference fd. Whatever process runs this first + * shall create a new namespace, all others should just join + * it. To serialize that we use a file lock on the socket + * pair. + * + * It's a bit crazy, but hey, works great! */ + + if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) + return -errno; + + netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); + if (netns == -EAGAIN) { + /* Nothing stored yet, so let's create a new namespace */ + + if (unshare(CLONE_NEWNET) < 0) { + r = -errno; + goto fail; + } + + loopback_setup(); + + netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (netns < 0) { + r = -errno; + goto fail; + } + + r = 1; + + } else if (netns < 0) { + r = netns; + goto fail; + + } else { + /* Yay, found something, so let's join the namespace */ + if (setns(netns, CLONE_NEWNET) < 0) { + r = -errno; + goto fail; + } + + r = 0; + } + + q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT); + if (q < 0) { + r = q; + goto fail; + } + +fail: + (void) lockf(netns_storage_socket[0], F_ULOCK, 0); + return r; +} + +bool ns_type_supported(NamespaceType type) { + const char *t, *ns_proc; + + t = namespace_type_to_string(type); + if (!t) /* Don't know how to translate this? Then it's not supported */ + return false; + + ns_proc = strjoina("/proc/self/ns/", t); + return access(ns_proc, F_OK) == 0; +} + +static const char *const protect_home_table[_PROTECT_HOME_MAX] = { + [PROTECT_HOME_NO] = "no", + [PROTECT_HOME_YES] = "yes", + [PROTECT_HOME_READ_ONLY] = "read-only", + [PROTECT_HOME_TMPFS] = "tmpfs", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES); + +static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = { + [PROTECT_SYSTEM_NO] = "no", + [PROTECT_SYSTEM_YES] = "yes", + [PROTECT_SYSTEM_FULL] = "full", + [PROTECT_SYSTEM_STRICT] = "strict", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES); + +static const char* const namespace_type_table[] = { + [NAMESPACE_MOUNT] = "mnt", + [NAMESPACE_CGROUP] = "cgroup", + [NAMESPACE_UTS] = "uts", + [NAMESPACE_IPC] = "ipc", + [NAMESPACE_USER] = "user", + [NAMESPACE_PID] = "pid", + [NAMESPACE_NET] = "net", +}; + +DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType); diff --git a/src/core/namespace.h b/src/core/namespace.h new file mode 100644 index 0000000..5e0ec97 --- /dev/null +++ b/src/core/namespace.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +/*** + Copyright © 2016 Djalal Harouni +***/ + +typedef struct NamespaceInfo NamespaceInfo; +typedef struct BindMount BindMount; +typedef struct TemporaryFileSystem TemporaryFileSystem; + +#include <stdbool.h> + +#include "dissect-image.h" +#include "macro.h" + +typedef enum ProtectHome { + PROTECT_HOME_NO, + PROTECT_HOME_YES, + PROTECT_HOME_READ_ONLY, + PROTECT_HOME_TMPFS, + _PROTECT_HOME_MAX, + _PROTECT_HOME_INVALID = -1 +} ProtectHome; + +typedef enum NamespaceType { + NAMESPACE_MOUNT, + NAMESPACE_CGROUP, + NAMESPACE_UTS, + NAMESPACE_IPC, + NAMESPACE_USER, + NAMESPACE_PID, + NAMESPACE_NET, + _NAMESPACE_TYPE_MAX, + _NAMESPACE_TYPE_INVALID = -1, +} NamespaceType; + +typedef enum ProtectSystem { + PROTECT_SYSTEM_NO, + PROTECT_SYSTEM_YES, + PROTECT_SYSTEM_FULL, + PROTECT_SYSTEM_STRICT, + _PROTECT_SYSTEM_MAX, + _PROTECT_SYSTEM_INVALID = -1 +} ProtectSystem; + +struct NamespaceInfo { + bool ignore_protect_paths:1; + bool private_dev:1; + bool private_mounts:1; + bool protect_control_groups:1; + bool protect_kernel_tunables:1; + bool protect_kernel_modules:1; + bool mount_apivfs:1; +}; + +struct BindMount { + char *source; + char *destination; + bool read_only:1; + bool recursive:1; + bool ignore_enoent:1; +}; + +struct TemporaryFileSystem { + char *path; + char *options; +}; + +int setup_namespace( + const char *root_directory, + const char *root_image, + const NamespaceInfo *ns_info, + char **read_write_paths, + char **read_only_paths, + char **inaccessible_paths, + char **empty_directories, + const BindMount *bind_mounts, + size_t n_bind_mounts, + const TemporaryFileSystem *temporary_filesystems, + size_t n_temporary_filesystems, + const char *tmp_dir, + const char *var_tmp_dir, + ProtectHome protect_home, + ProtectSystem protect_system, + unsigned long mount_flags, + DissectImageFlags dissected_image_flags); + +int setup_tmp_dirs( + const char *id, + char **tmp_dir, + char **var_tmp_dir); + +int setup_netns(int netns_storage_socket[static 2]); + +const char* protect_home_to_string(ProtectHome p) _const_; +ProtectHome protect_home_from_string(const char *s) _pure_; + +const char* protect_system_to_string(ProtectSystem p) _const_; +ProtectSystem protect_system_from_string(const char *s) _pure_; + +void bind_mount_free_many(BindMount *b, size_t n); +int bind_mount_add(BindMount **b, size_t *n, const BindMount *item); + +void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n); +int temporary_filesystem_add(TemporaryFileSystem **t, size_t *n, + const char *path, const char *options); + +const char* namespace_type_to_string(NamespaceType t) _const_; +NamespaceType namespace_type_from_string(const char *s) _pure_; + +bool ns_type_supported(NamespaceType type); diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf new file mode 100644 index 0000000..415b3f5 --- /dev/null +++ b/src/core/org.freedesktop.systemd1.conf @@ -0,0 +1,400 @@ +<?xml version="1.0"?> <!--*-nxml-*--> +<!DOCTYPE busconfig PUBLIC "-//freedesktop//DTD D-BUS Bus Configuration 1.0//EN" + "http://www.freedesktop.org/standards/dbus/1.0/busconfig.dtd"> + +<!-- + SPDX-License-Identifier: LGPL-2.1+ + + This file is part of systemd. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. +--> + +<busconfig> + + <policy user="root"> + <allow own="org.freedesktop.systemd1"/> + + <!-- Root clients can do everything --> + <allow send_destination="org.freedesktop.systemd1"/> + <allow receive_sender="org.freedesktop.systemd1"/> + + <!-- systemd may receive activator requests --> + <allow receive_interface="org.freedesktop.systemd1.Activator" + receive_member="ActivationRequest"/> + </policy> + + <policy context="default"> + <deny send_destination="org.freedesktop.systemd1"/> + + <!-- Completely open to anyone: org.freedesktop.DBus.* interfaces --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.DBus.Introspectable"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.DBus.Peer"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.DBus.Properties" + send_member="Get"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.DBus.Properties" + send_member="GetAll"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Manager interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitByPID"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitByInvocationID"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitByControlGroup"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="LoadUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitProcesses"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetJob"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetJobAfter"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetJobBefore"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnits"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitsFiltered"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitsByPatterns"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitsByNames"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListJobs"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Subscribe"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Unsubscribe"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Dump"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="DumpByFileDescriptor"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitFilesByPatterns"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitFileState"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetDefaultTarget"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitFileLinks"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="LookupDynamicUserByName"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="LookupDynamicUserByUID"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetDynamicUsers"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Unit interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Service" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Slice interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Slice" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Scope interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Scope" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Socket interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Socket" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Mount interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Mount" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Swap interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Swap" + send_member="GetProcesses"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Manager interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="StartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="StartUnitReplace"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="StopUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ReloadUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="RestartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="TryRestartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ReloadOrRestartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ReloadOrTryRestartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="KillUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ResetFailedUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="SetUnitProperties"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="RefUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="UnrefUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="StartTransientUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="AttachProcessesToUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="CancelJob"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ClearJobs"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ResetFailed"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Reload"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Reexecute"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="EnableUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="DisableUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ReenableUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="LinkUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="PresetUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="PresetUnitFilesWithMode"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="MaskUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="UnmaskUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="RevertUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="SetDefaultTarget"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="PresetAllUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="AddDependencyUnitFiles"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Job interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Job" + send_member="Cancel"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Job" + send_member="GetAfter"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Job" + send_member="GetBefore"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Unit interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Start"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Stop"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Reload"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Restart"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="TryRestart"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="ReloadOrRestart"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="ReloadOrTryRestart"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Kill"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="ResetFailed"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="SetProperties"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Ref"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Unref"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Service interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Service" + send_member="AttachProcesses"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Scope interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Scope" + send_member="AttachProcesses"/> + + <allow receive_sender="org.freedesktop.systemd1"/> + </policy> + +</busconfig> diff --git a/src/core/org.freedesktop.systemd1.policy.in b/src/core/org.freedesktop.systemd1.policy.in new file mode 100644 index 0000000..001408d --- /dev/null +++ b/src/core/org.freedesktop.systemd1.policy.in @@ -0,0 +1,73 @@ +<?xml version="1.0" encoding="UTF-8"?> <!--*-nxml-*--> +<!DOCTYPE policyconfig PUBLIC "-//freedesktop//DTD PolicyKit Policy Configuration 1.0//EN" + "http://www.freedesktop.org/standards/PolicyKit/1/policyconfig.dtd"> + +<!-- + SPDX-License-Identifier: LGPL-2.1+ + + This file is part of systemd. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. +--> + +<policyconfig> + + <vendor>The systemd Project</vendor> + <vendor_url>http://www.freedesktop.org/wiki/Software/systemd</vendor_url> + + <action id="org.freedesktop.systemd1.reply-password"> + <description gettext-domain="systemd">Send passphrase back to system</description> + <message gettext-domain="systemd">Authentication is required to send the entered passphrase back to the system.</message> + <defaults> + <allow_any>no</allow_any> + <allow_inactive>no</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + <annotate key="org.freedesktop.policykit.exec.path">@rootlibexecdir@/systemd-reply-password</annotate> + </action> + + <action id="org.freedesktop.systemd1.manage-units"> + <description gettext-domain="systemd">Manage system services or other units</description> + <message gettext-domain="systemd">Authentication is required to manage system services or other units.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + </action> + + <action id="org.freedesktop.systemd1.manage-unit-files"> + <description gettext-domain="systemd">Manage system service or unit files</description> + <message gettext-domain="systemd">Authentication is required to manage system service or unit files.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + <annotate key="org.freedesktop.policykit.imply">org.freedesktop.systemd1.reload-daemon org.freedesktop.systemd1.manage-units</annotate> + </action> + + <action id="org.freedesktop.systemd1.set-environment"> + <description gettext-domain="systemd">Set or unset system and service manager environment variables</description> + <message gettext-domain="systemd">Authentication is required to set or unset system and service manager environment variables.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + </action> + + <action id="org.freedesktop.systemd1.reload-daemon"> + <description gettext-domain="systemd">Reload the systemd state</description> + <message gettext-domain="systemd">Authentication is required to reload the systemd state.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + </action> + +</policyconfig> diff --git a/src/core/org.freedesktop.systemd1.service b/src/core/org.freedesktop.systemd1.service new file mode 100644 index 0000000..8bd7302 --- /dev/null +++ b/src/core/org.freedesktop.systemd1.service @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: LGPL-2.1+ +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[D-BUS Service] +Name=org.freedesktop.systemd1 +Exec=/bin/false +User=root diff --git a/src/core/path.c b/src/core/path.c new file mode 100644 index 0000000..831e49d --- /dev/null +++ b/src/core/path.c @@ -0,0 +1,787 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <sys/epoll.h> +#include <sys/inotify.h> +#include <unistd.h> + +#include "bus-error.h" +#include "bus-util.h" +#include "dbus-path.h" +#include "dbus-unit.h" +#include "fd-util.h" +#include "fs-util.h" +#include "glob-util.h" +#include "macro.h" +#include "mkdir.h" +#include "path.h" +#include "serialize.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_PATH_STATE_MAX] = { + [PATH_DEAD] = UNIT_INACTIVE, + [PATH_WAITING] = UNIT_ACTIVE, + [PATH_RUNNING] = UNIT_ACTIVE, + [PATH_FAILED] = UNIT_FAILED +}; + +static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); + +int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler) { + + static const int flags_table[_PATH_TYPE_MAX] = { + [PATH_EXISTS] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB, + [PATH_EXISTS_GLOB] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB, + [PATH_CHANGED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO, + [PATH_MODIFIED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO|IN_MODIFY, + [PATH_DIRECTORY_NOT_EMPTY] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CREATE|IN_MOVED_TO + }; + + bool exists = false; + char *slash, *oldslash = NULL; + int r; + + assert(s); + assert(s->unit); + assert(handler); + + path_spec_unwatch(s); + + s->inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (s->inotify_fd < 0) { + r = -errno; + goto fail; + } + + r = sd_event_add_io(s->unit->manager->event, &s->event_source, s->inotify_fd, EPOLLIN, handler, s); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(s->event_source, "path"); + + /* This function assumes the path was passed through path_simplify()! */ + assert(!strstr(s->path, "//")); + + for (slash = strchr(s->path, '/'); ; slash = strchr(slash+1, '/')) { + char *cut = NULL; + int flags; + char tmp; + + if (slash) { + cut = slash + (slash == s->path); + tmp = *cut; + *cut = '\0'; + + flags = IN_MOVE_SELF | IN_DELETE_SELF | IN_ATTRIB | IN_CREATE | IN_MOVED_TO; + } else + flags = flags_table[s->type]; + + r = inotify_add_watch(s->inotify_fd, s->path, flags); + if (r < 0) { + if (IN_SET(errno, EACCES, ENOENT)) { + if (cut) + *cut = tmp; + break; + } + + r = log_warning_errno(errno, "Failed to add watch on %s: %s", s->path, errno == ENOSPC ? "too many watches" : strerror(-r)); + if (cut) + *cut = tmp; + goto fail; + } else { + exists = true; + + /* Path exists, we don't need to watch parent too closely. */ + if (oldslash) { + char *cut2 = oldslash + (oldslash == s->path); + char tmp2 = *cut2; + *cut2 = '\0'; + + (void) inotify_add_watch(s->inotify_fd, s->path, IN_MOVE_SELF); + /* Error is ignored, the worst can happen is we get spurious events. */ + + *cut2 = tmp2; + } + } + + if (cut) + *cut = tmp; + + if (slash) + oldslash = slash; + else { + /* whole path has been iterated over */ + s->primary_wd = r; + break; + } + } + + if (!exists) { + r = log_error_errno(errno, "Failed to add watch on any of the components of %s: %m", s->path); + /* either EACCESS or ENOENT */ + goto fail; + } + + return 0; + +fail: + path_spec_unwatch(s); + return r; +} + +void path_spec_unwatch(PathSpec *s) { + assert(s); + + s->event_source = sd_event_source_unref(s->event_source); + s->inotify_fd = safe_close(s->inotify_fd); +} + +int path_spec_fd_event(PathSpec *s, uint32_t revents) { + union inotify_event_buffer buffer; + struct inotify_event *e; + ssize_t l; + int r = 0; + + if (revents != EPOLLIN) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Got invalid poll event on inotify."); + + l = read(s->inotify_fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (IN_SET(errno, EAGAIN, EINTR)) + return 0; + + return log_error_errno(errno, "Failed to read inotify event: %m"); + } + + FOREACH_INOTIFY_EVENT(e, buffer, l) { + if (IN_SET(s->type, PATH_CHANGED, PATH_MODIFIED) && + s->primary_wd == e->wd) + r = 1; + } + + return r; +} + +static bool path_spec_check_good(PathSpec *s, bool initial) { + bool good = false; + + switch (s->type) { + + case PATH_EXISTS: + good = access(s->path, F_OK) >= 0; + break; + + case PATH_EXISTS_GLOB: + good = glob_exists(s->path) > 0; + break; + + case PATH_DIRECTORY_NOT_EMPTY: { + int k; + + k = dir_is_empty(s->path); + good = !(k == -ENOENT || k > 0); + break; + } + + case PATH_CHANGED: + case PATH_MODIFIED: { + bool b; + + b = access(s->path, F_OK) >= 0; + good = !initial && b != s->previous_exists; + s->previous_exists = b; + break; + } + + default: + ; + } + + return good; +} + +static void path_spec_mkdir(PathSpec *s, mode_t mode) { + int r; + + if (IN_SET(s->type, PATH_EXISTS, PATH_EXISTS_GLOB)) + return; + + r = mkdir_p_label(s->path, mode); + if (r < 0) + log_warning_errno(r, "mkdir(%s) failed: %m", s->path); +} + +static void path_spec_dump(PathSpec *s, FILE *f, const char *prefix) { + fprintf(f, + "%s%s: %s\n", + prefix, + path_type_to_string(s->type), + s->path); +} + +void path_spec_done(PathSpec *s) { + assert(s); + assert(s->inotify_fd == -1); + + free(s->path); +} + +static void path_init(Unit *u) { + Path *p = PATH(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + p->directory_mode = 0755; +} + +void path_free_specs(Path *p) { + PathSpec *s; + + assert(p); + + while ((s = p->specs)) { + path_spec_unwatch(s); + LIST_REMOVE(spec, p->specs, s); + path_spec_done(s); + free(s); + } +} + +static void path_done(Unit *u) { + Path *p = PATH(u); + + assert(p); + + path_free_specs(p); +} + +static int path_add_mount_dependencies(Path *p) { + PathSpec *s; + int r; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) { + r = unit_require_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + return 0; +} + +static int path_verify(Path *p) { + assert(p); + + if (UNIT(p)->load_state != UNIT_LOADED) + return 0; + + if (!p->specs) { + log_unit_error(UNIT(p), "Path unit lacks path setting. Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static int path_add_default_dependencies(Path *p) { + int r; + + assert(p); + + if (!UNIT(p)->default_dependencies) + return 0; + + r = unit_add_dependency_by_name(UNIT(p), UNIT_BEFORE, SPECIAL_PATHS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(UNIT(p)->manager)) { + r = unit_add_two_dependencies_by_name(UNIT(p), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } + + return unit_add_two_dependencies_by_name(UNIT(p), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int path_add_trigger_dependencies(Path *p) { + Unit *x; + int r; + + assert(p); + + if (!hashmap_isempty(UNIT(p)->dependencies[UNIT_TRIGGERS])) + return 0; + + r = unit_load_related_unit(UNIT(p), ".service", &x); + if (r < 0) + return r; + + return unit_add_two_dependencies(UNIT(p), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int path_load(Unit *u) { + Path *p = PATH(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + r = unit_load_fragment_and_dropin(u); + if (r < 0) + return r; + + if (u->load_state == UNIT_LOADED) { + + r = path_add_trigger_dependencies(p); + if (r < 0) + return r; + + r = path_add_mount_dependencies(p); + if (r < 0) + return r; + + r = path_add_default_dependencies(p); + if (r < 0) + return r; + } + + return path_verify(p); +} + +static void path_dump(Unit *u, FILE *f, const char *prefix) { + Path *p = PATH(u); + Unit *trigger; + PathSpec *s; + + assert(p); + assert(f); + + trigger = UNIT_TRIGGER(u); + + fprintf(f, + "%sPath State: %s\n" + "%sResult: %s\n" + "%sUnit: %s\n" + "%sMakeDirectory: %s\n" + "%sDirectoryMode: %04o\n", + prefix, path_state_to_string(p->state), + prefix, path_result_to_string(p->result), + prefix, trigger ? trigger->id : "n/a", + prefix, yes_no(p->make_directory), + prefix, p->directory_mode); + + LIST_FOREACH(spec, s, p->specs) + path_spec_dump(s, f, prefix); +} + +static void path_unwatch(Path *p) { + PathSpec *s; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) + path_spec_unwatch(s); +} + +static int path_watch(Path *p) { + int r; + PathSpec *s; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) { + r = path_spec_watch(s, path_dispatch_io); + if (r < 0) + return r; + } + + return 0; +} + +static void path_set_state(Path *p, PathState state) { + PathState old_state; + assert(p); + + if (p->state != state) + bus_unit_send_pending_change_signal(UNIT(p), false); + + old_state = p->state; + p->state = state; + + if (state != PATH_WAITING && + (state != PATH_RUNNING || p->inotify_triggered)) + path_unwatch(p); + + if (state != old_state) + log_unit_debug(UNIT(p), "Changed %s -> %s", path_state_to_string(old_state), path_state_to_string(state)); + + unit_notify(UNIT(p), state_translation_table[old_state], state_translation_table[state], 0); +} + +static void path_enter_waiting(Path *p, bool initial, bool recheck); + +static int path_coldplug(Unit *u) { + Path *p = PATH(u); + + assert(p); + assert(p->state == PATH_DEAD); + + if (p->deserialized_state != p->state) { + + if (IN_SET(p->deserialized_state, PATH_WAITING, PATH_RUNNING)) + path_enter_waiting(p, true, true); + else + path_set_state(p, p->deserialized_state); + } + + return 0; +} + +static void path_enter_dead(Path *p, PathResult f) { + assert(p); + + if (p->result == PATH_SUCCESS) + p->result = f; + + unit_log_result(UNIT(p), p->result == PATH_SUCCESS, path_result_to_string(p->result)); + path_set_state(p, p->result != PATH_SUCCESS ? PATH_FAILED : PATH_DEAD); +} + +static void path_enter_running(Path *p) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *trigger; + int r; + + assert(p); + + /* Don't start job if we are supposed to go down */ + if (unit_stop_pending(UNIT(p))) + return; + + trigger = UNIT_TRIGGER(UNIT(p)); + if (!trigger) { + log_unit_error(UNIT(p), "Unit to trigger vanished."); + path_enter_dead(p, PATH_FAILURE_RESOURCES); + return; + } + + r = manager_add_job(UNIT(p)->manager, JOB_START, trigger, JOB_REPLACE, &error, NULL); + if (r < 0) + goto fail; + + p->inotify_triggered = false; + + r = path_watch(p); + if (r < 0) + goto fail; + + path_set_state(p, PATH_RUNNING); + return; + +fail: + log_unit_warning(UNIT(p), "Failed to queue unit startup job: %s", bus_error_message(&error, r)); + path_enter_dead(p, PATH_FAILURE_RESOURCES); +} + +static bool path_check_good(Path *p, bool initial) { + PathSpec *s; + bool good = false; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) { + good = path_spec_check_good(s, initial); + + if (good) + break; + } + + return good; +} + +static void path_enter_waiting(Path *p, bool initial, bool recheck) { + int r; + + if (recheck) + if (path_check_good(p, initial)) { + log_unit_debug(UNIT(p), "Got triggered."); + path_enter_running(p); + return; + } + + r = path_watch(p); + if (r < 0) + goto fail; + + /* Hmm, so now we have created inotify watches, but the file + * might have appeared/been removed by now, so we must + * recheck */ + + if (recheck) + if (path_check_good(p, false)) { + log_unit_debug(UNIT(p), "Got triggered."); + path_enter_running(p); + return; + } + + path_set_state(p, PATH_WAITING); + return; + +fail: + log_unit_warning_errno(UNIT(p), r, "Failed to enter waiting state: %m"); + path_enter_dead(p, PATH_FAILURE_RESOURCES); +} + +static void path_mkdir(Path *p) { + PathSpec *s; + + assert(p); + + if (!p->make_directory) + return; + + LIST_FOREACH(spec, s, p->specs) + path_spec_mkdir(s, p->directory_mode); +} + +static int path_start(Unit *u) { + Path *p = PATH(u); + Unit *trigger; + int r; + + assert(p); + assert(IN_SET(p->state, PATH_DEAD, PATH_FAILED)); + + trigger = UNIT_TRIGGER(u); + if (!trigger || trigger->load_state != UNIT_LOADED) { + log_unit_error(u, "Refusing to start, unit to trigger not loaded."); + return -ENOENT; + } + + r = unit_start_limit_test(u); + if (r < 0) { + path_enter_dead(p, PATH_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + path_mkdir(p); + + p->result = PATH_SUCCESS; + path_enter_waiting(p, true, true); + + return 1; +} + +static int path_stop(Unit *u) { + Path *p = PATH(u); + + assert(p); + assert(IN_SET(p->state, PATH_WAITING, PATH_RUNNING)); + + path_enter_dead(p, PATH_SUCCESS); + return 1; +} + +static int path_serialize(Unit *u, FILE *f, FDSet *fds) { + Path *p = PATH(u); + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", path_state_to_string(p->state)); + (void) serialize_item(f, "result", path_result_to_string(p->result)); + + return 0; +} + +static int path_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Path *p = PATH(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + PathState state; + + state = path_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + p->deserialized_state = state; + + } else if (streq(key, "result")) { + PathResult f; + + f = path_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != PATH_SUCCESS) + p->result = f; + + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +_pure_ static UnitActiveState path_active_state(Unit *u) { + assert(u); + + return state_translation_table[PATH(u)->state]; +} + +_pure_ static const char *path_sub_state_to_string(Unit *u) { + assert(u); + + return path_state_to_string(PATH(u)->state); +} + +static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + PathSpec *s = userdata; + Path *p; + int changed; + + assert(s); + assert(s->unit); + assert(fd >= 0); + + p = PATH(s->unit); + + if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING)) + return 0; + + /* log_debug("inotify wakeup on %s.", u->id); */ + + LIST_FOREACH(spec, s, p->specs) + if (path_spec_owns_inotify_fd(s, fd)) + break; + + if (!s) { + log_error("Got event on unknown fd."); + goto fail; + } + + changed = path_spec_fd_event(s, revents); + if (changed < 0) + goto fail; + + /* If we are already running, then remember that one event was + * dispatched so that we restart the service only if something + * actually changed on disk */ + p->inotify_triggered = true; + + if (changed) + path_enter_running(p); + else + path_enter_waiting(p, false, true); + + return 0; + +fail: + path_enter_dead(p, PATH_FAILURE_RESOURCES); + return 0; +} + +static void path_trigger_notify(Unit *u, Unit *other) { + Path *p = PATH(u); + + assert(u); + assert(other); + + /* Invoked whenever the unit we trigger changes state or gains + * or loses a job */ + + if (other->load_state != UNIT_LOADED) + return; + + if (p->state == PATH_RUNNING && + UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) { + log_unit_debug(UNIT(p), "Got notified about unit deactivation."); + + /* Hmm, so inotify was triggered since the + * last activation, so I guess we need to + * recheck what is going on. */ + path_enter_waiting(p, false, p->inotify_triggered); + } +} + +static void path_reset_failed(Unit *u) { + Path *p = PATH(u); + + assert(p); + + if (p->state == PATH_FAILED) + path_set_state(p, PATH_DEAD); + + p->result = PATH_SUCCESS; +} + +static const char* const path_type_table[_PATH_TYPE_MAX] = { + [PATH_EXISTS] = "PathExists", + [PATH_EXISTS_GLOB] = "PathExistsGlob", + [PATH_DIRECTORY_NOT_EMPTY] = "DirectoryNotEmpty", + [PATH_CHANGED] = "PathChanged", + [PATH_MODIFIED] = "PathModified", +}; + +DEFINE_STRING_TABLE_LOOKUP(path_type, PathType); + +static const char* const path_result_table[_PATH_RESULT_MAX] = { + [PATH_SUCCESS] = "success", + [PATH_FAILURE_RESOURCES] = "resources", + [PATH_FAILURE_START_LIMIT_HIT] = "start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(path_result, PathResult); + +const UnitVTable path_vtable = { + .object_size = sizeof(Path), + + .sections = + "Unit\0" + "Path\0" + "Install\0", + .private_section = "Path", + + .can_transient = true, + + .init = path_init, + .done = path_done, + .load = path_load, + + .coldplug = path_coldplug, + + .dump = path_dump, + + .start = path_start, + .stop = path_stop, + + .serialize = path_serialize, + .deserialize_item = path_deserialize_item, + + .active_state = path_active_state, + .sub_state_to_string = path_sub_state_to_string, + + .trigger_notify = path_trigger_notify, + + .reset_failed = path_reset_failed, + + .bus_vtable = bus_path_vtable, + .bus_set_property = bus_path_set_property, +}; diff --git a/src/core/path.h b/src/core/path.h new file mode 100644 index 0000000..4d4b623 --- /dev/null +++ b/src/core/path.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct Path Path; +typedef struct PathSpec PathSpec; + +#include "unit.h" + +typedef enum PathType { + PATH_EXISTS, + PATH_EXISTS_GLOB, + PATH_DIRECTORY_NOT_EMPTY, + PATH_CHANGED, + PATH_MODIFIED, + _PATH_TYPE_MAX, + _PATH_TYPE_INVALID = -1 +} PathType; + +typedef struct PathSpec { + Unit *unit; + + char *path; + + sd_event_source *event_source; + + LIST_FIELDS(struct PathSpec, spec); + + PathType type; + int inotify_fd; + int primary_wd; + + bool previous_exists; +} PathSpec; + +int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler); +void path_spec_unwatch(PathSpec *s); +int path_spec_fd_event(PathSpec *s, uint32_t events); +void path_spec_done(PathSpec *s); + +static inline bool path_spec_owns_inotify_fd(PathSpec *s, int fd) { + return s->inotify_fd == fd; +} + +typedef enum PathResult { + PATH_SUCCESS, + PATH_FAILURE_RESOURCES, + PATH_FAILURE_START_LIMIT_HIT, + _PATH_RESULT_MAX, + _PATH_RESULT_INVALID = -1 +} PathResult; + +struct Path { + Unit meta; + + LIST_HEAD(PathSpec, specs); + + PathState state, deserialized_state; + + bool inotify_triggered; + + bool make_directory; + mode_t directory_mode; + + PathResult result; +}; + +void path_free_specs(Path *p); + +extern const UnitVTable path_vtable; + +const char* path_type_to_string(PathType i) _const_; +PathType path_type_from_string(const char *s) _pure_; + +const char* path_result_to_string(PathResult i) _const_; +PathResult path_result_from_string(const char *s) _pure_; + +DEFINE_CAST(PATH, Path); diff --git a/src/core/scope.c b/src/core/scope.c new file mode 100644 index 0000000..e478661 --- /dev/null +++ b/src/core/scope.c @@ -0,0 +1,624 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "dbus-scope.h" +#include "dbus-unit.h" +#include "load-dropin.h" +#include "log.h" +#include "scope.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = { + [SCOPE_DEAD] = UNIT_INACTIVE, + [SCOPE_RUNNING] = UNIT_ACTIVE, + [SCOPE_ABANDONED] = UNIT_ACTIVE, + [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SCOPE_FAILED] = UNIT_FAILED +}; + +static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); + +static void scope_init(Unit *u) { + Scope *s = SCOPE(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + s->timeout_stop_usec = u->manager->default_timeout_stop_usec; + u->ignore_on_isolate = true; +} + +static void scope_done(Unit *u) { + Scope *s = SCOPE(u); + + assert(u); + + s->controller = mfree(s->controller); + s->controller_track = sd_bus_track_unref(s->controller_track); + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); +} + +static int scope_arm_timer(Scope *s, usec_t usec) { + int r; + + assert(s); + + if (s->timer_event_source) { + r = sd_event_source_set_time(s->timer_event_source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time( + UNIT(s)->manager->event, + &s->timer_event_source, + CLOCK_MONOTONIC, + usec, 0, + scope_dispatch_timer, s); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->timer_event_source, "scope-timer"); + + return 0; +} + +static void scope_set_state(Scope *s, ScopeState state) { + ScopeState old_state; + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + old_state = s->state; + s->state = state; + + if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + + if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) { + unit_unwatch_all_pids(UNIT(s)); + unit_dequeue_rewatch_pids(UNIT(s)); + } + + if (state != old_state) + log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state)); + + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int scope_add_default_dependencies(Scope *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + /* Make sure scopes are unloaded on shutdown */ + r = unit_add_two_dependencies_by_name( + UNIT(s), + UNIT_BEFORE, UNIT_CONFLICTS, + SPECIAL_SHUTDOWN_TARGET, true, + UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return 0; +} + +static int scope_verify(Scope *s) { + assert(s); + + if (UNIT(s)->load_state != UNIT_LOADED) + return 0; + + if (set_isempty(UNIT(s)->pids) && + !MANAGER_IS_RELOADING(UNIT(s)->manager) && + !unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) { + log_unit_error(UNIT(s), "Scope has no PIDs. Refusing."); + return -ENOENT; + } + + return 0; +} + +static int scope_load_init_scope(Unit *u) { + assert(u); + + if (!unit_has_name(u, SPECIAL_INIT_SCOPE)) + return 0; + + u->transient = true; + u->perpetual = true; + + /* init.scope is a bit special, as it has to stick around forever. Because of its special semantics we + * synthesize it here, instead of relying on the unit file on disk. */ + + u->default_dependencies = false; + + /* Prettify things, if we can. */ + if (!u->description) + u->description = strdup("System and Service Manager"); + if (!u->documentation) + (void) strv_extend(&u->documentation, "man:systemd(1)"); + + return 1; +} + +static int scope_load(Unit *u) { + Scope *s = SCOPE(u); + int r; + + assert(s); + assert(u->load_state == UNIT_STUB); + + if (!u->transient && !MANAGER_IS_RELOADING(u->manager)) + /* Refuse to load non-transient scope units, but allow them while reloading. */ + return -ENOENT; + + r = scope_load_init_scope(u); + if (r < 0) + return r; + r = unit_load_fragment_and_dropin_optional(u); + if (r < 0) + return r; + + if (u->load_state == UNIT_LOADED) { + r = unit_patch_contexts(u); + if (r < 0) + return r; + + r = unit_set_default_slice(u); + if (r < 0) + return r; + + r = scope_add_default_dependencies(s); + if (r < 0) + return r; + } + + return scope_verify(s); +} + +static int scope_coldplug(Unit *u) { + Scope *s = SCOPE(u); + int r; + + assert(s); + assert(s->state == SCOPE_DEAD); + + if (s->deserialized_state == s->state) + return 0; + + if (IN_SET(s->deserialized_state, SCOPE_STOP_SIGKILL, SCOPE_STOP_SIGTERM)) { + r = scope_arm_timer(s, usec_add(u->state_change_timestamp.monotonic, s->timeout_stop_usec)); + if (r < 0) + return r; + } + + if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED)) + (void) unit_enqueue_rewatch_pids(u); + + bus_scope_track_controller(s); + + scope_set_state(s, s->deserialized_state); + return 0; +} + +static void scope_dump(Unit *u, FILE *f, const char *prefix) { + Scope *s = SCOPE(u); + + assert(s); + assert(f); + + fprintf(f, + "%sScope State: %s\n" + "%sResult: %s\n", + prefix, scope_state_to_string(s->state), + prefix, scope_result_to_string(s->result)); + + cgroup_context_dump(&s->cgroup_context, f, prefix); + kill_context_dump(&s->kill_context, f, prefix); +} + +static void scope_enter_dead(Scope *s, ScopeResult f) { + assert(s); + + if (s->result == SCOPE_SUCCESS) + s->result = f; + + unit_log_result(UNIT(s), s->result == SCOPE_SUCCESS, scope_result_to_string(s->result)); + scope_set_state(s, s->result != SCOPE_SUCCESS ? SCOPE_FAILED : SCOPE_DEAD); +} + +static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) { + bool skip_signal = false; + int r; + + assert(s); + + if (s->result == SCOPE_SUCCESS) + s->result = f; + + /* Before sending any signal, make sure we track all members of this cgroup */ + (void) unit_watch_all_pids(UNIT(s)); + + /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have + * died now */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + /* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going + * directly into SIGTERM berserk mode */ + if (state == SCOPE_STOP_SIGTERM) + skip_signal = bus_scope_send_request_stop(s) > 0; + + if (skip_signal) + r = 1; /* wait */ + else { + r = unit_kill_context( + UNIT(s), + &s->kill_context, + state != SCOPE_STOP_SIGTERM ? KILL_KILL : + s->was_abandoned ? KILL_TERMINATE_AND_LOG : + KILL_TERMINATE, + -1, -1, false); + if (r < 0) + goto fail; + } + + if (r > 0) { + r = scope_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec)); + if (r < 0) + goto fail; + + scope_set_state(s, state); + } else if (state == SCOPE_STOP_SIGTERM) + scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_SUCCESS); + else + scope_enter_dead(s, SCOPE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + + scope_enter_dead(s, SCOPE_FAILURE_RESOURCES); +} + +static int scope_start(Unit *u) { + Scope *s = SCOPE(u); + int r; + + assert(s); + + if (unit_has_name(u, SPECIAL_INIT_SCOPE)) + return -EPERM; + + if (s->state == SCOPE_FAILED) + return -EPERM; + + /* We can't fulfill this right now, please try again later */ + if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + return -EAGAIN; + + assert(s->state == SCOPE_DEAD); + + if (!u->transient && !MANAGER_IS_RELOADING(u->manager)) + return -ENOENT; + + (void) bus_scope_track_controller(s); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + (void) unit_realize_cgroup(u); + (void) unit_reset_cpu_accounting(u); + (void) unit_reset_ip_accounting(u); + + unit_export_state_files(UNIT(s)); + + r = unit_attach_pids_to_cgroup(u, UNIT(s)->pids, NULL); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to add PIDs to scope's control group: %m"); + scope_enter_dead(s, SCOPE_FAILURE_RESOURCES); + return r; + } + + s->result = SCOPE_SUCCESS; + + scope_set_state(s, SCOPE_RUNNING); + + /* Start watching the PIDs currently in the scope */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + return 1; +} + +static int scope_stop(Unit *u) { + Scope *s = SCOPE(u); + + assert(s); + + if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + return 0; + + assert(IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED)); + + scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS); + return 1; +} + +static void scope_reset_failed(Unit *u) { + Scope *s = SCOPE(u); + + assert(s); + + if (s->state == SCOPE_FAILED) + scope_set_state(s, SCOPE_DEAD); + + s->result = SCOPE_SUCCESS; +} + +static int scope_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + return unit_kill_common(u, who, signo, -1, -1, error); +} + +static int scope_get_timeout(Unit *u, usec_t *timeout) { + Scope *s = SCOPE(u); + usec_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static int scope_serialize(Unit *u, FILE *f, FDSet *fds) { + Scope *s = SCOPE(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", scope_state_to_string(s->state)); + (void) serialize_bool(f, "was-abandoned", s->was_abandoned); + + if (s->controller) + (void) serialize_item(f, "controller", s->controller); + + return 0; +} + +static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Scope *s = SCOPE(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + ScopeState state; + + state = scope_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + + } else if (streq(key, "was-abandoned")) { + int k; + + k = parse_boolean(value); + if (k < 0) + log_unit_debug(u, "Failed to parse boolean value: %s", value); + else + s->was_abandoned = k; + } else if (streq(key, "controller")) { + + r = free_and_strdup(&s->controller, value); + if (r < 0) + return log_oom(); + + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static void scope_notify_cgroup_empty_event(Unit *u) { + Scope *s = SCOPE(u); + assert(u); + + log_unit_debug(u, "cgroup is empty"); + + if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + scope_enter_dead(s, SCOPE_SUCCESS); +} + +static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) { + assert(u); + + /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to + * watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original + * process we watched was probably the parent of them, and they are hence now our children. */ + + (void) unit_enqueue_rewatch_pids(u); +} + +static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Scope *s = SCOPE(userdata); + + assert(s); + assert(s->timer_event_source == source); + + switch (s->state) { + + case SCOPE_STOP_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out. Killing."); + scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL."); + scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT); + } + + break; + + case SCOPE_STOP_SIGKILL: + log_unit_warning(UNIT(s), "Still around after SIGKILL. Ignoring."); + scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +int scope_abandon(Scope *s) { + assert(s); + + if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) + return -EPERM; + + if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED)) + return -ESTALE; + + s->was_abandoned = true; + + s->controller = mfree(s->controller); + s->controller_track = sd_bus_track_unref(s->controller_track); + + scope_set_state(s, SCOPE_ABANDONED); + + /* The client is no longer watching the remaining processes, so let's step in here, under the assumption that + * the remaining processes will be sooner or later reassigned to us as parent. */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + return 0; +} + +_pure_ static UnitActiveState scope_active_state(Unit *u) { + assert(u); + + return state_translation_table[SCOPE(u)->state]; +} + +_pure_ static const char *scope_sub_state_to_string(Unit *u) { + assert(u); + + return scope_state_to_string(SCOPE(u)->state); +} + +static void scope_enumerate_perpetual(Manager *m) { + Unit *u; + int r; + + assert(m); + + /* Let's unconditionally add the "init.scope" special unit + * that encapsulates PID 1. Note that PID 1 already is in the + * cgroup for this, we hence just need to allocate the object + * for it and that's it. */ + + u = manager_get_unit(m, SPECIAL_INIT_SCOPE); + if (!u) { + r = unit_new_for_name(m, sizeof(Scope), SPECIAL_INIT_SCOPE, &u); + if (r < 0) { + log_error_errno(r, "Failed to allocate the special " SPECIAL_INIT_SCOPE " unit: %m"); + return; + } + } + + u->transient = true; + u->perpetual = true; + SCOPE(u)->deserialized_state = SCOPE_RUNNING; + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); +} + +static const char* const scope_result_table[_SCOPE_RESULT_MAX] = { + [SCOPE_SUCCESS] = "success", + [SCOPE_FAILURE_RESOURCES] = "resources", + [SCOPE_FAILURE_TIMEOUT] = "timeout", +}; + +DEFINE_STRING_TABLE_LOOKUP(scope_result, ScopeResult); + +const UnitVTable scope_vtable = { + .object_size = sizeof(Scope), + .cgroup_context_offset = offsetof(Scope, cgroup_context), + .kill_context_offset = offsetof(Scope, kill_context), + + .sections = + "Unit\0" + "Scope\0" + "Install\0", + .private_section = "Scope", + + .can_transient = true, + .can_delegate = true, + .once_only = true, + + .init = scope_init, + .load = scope_load, + .done = scope_done, + + .coldplug = scope_coldplug, + + .dump = scope_dump, + + .start = scope_start, + .stop = scope_stop, + + .kill = scope_kill, + + .get_timeout = scope_get_timeout, + + .serialize = scope_serialize, + .deserialize_item = scope_deserialize_item, + + .active_state = scope_active_state, + .sub_state_to_string = scope_sub_state_to_string, + + .sigchld_event = scope_sigchld_event, + + .reset_failed = scope_reset_failed, + + .notify_cgroup_empty = scope_notify_cgroup_empty_event, + + .bus_vtable = bus_scope_vtable, + .bus_set_property = bus_scope_set_property, + .bus_commit_properties = bus_scope_commit_properties, + + .enumerate_perpetual = scope_enumerate_perpetual, +}; diff --git a/src/core/scope.h b/src/core/scope.h new file mode 100644 index 0000000..c38afb5 --- /dev/null +++ b/src/core/scope.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct Scope Scope; + +#include "cgroup.h" +#include "kill.h" +#include "unit.h" + +typedef enum ScopeResult { + SCOPE_SUCCESS, + SCOPE_FAILURE_RESOURCES, + SCOPE_FAILURE_TIMEOUT, + _SCOPE_RESULT_MAX, + _SCOPE_RESULT_INVALID = -1 +} ScopeResult; + +struct Scope { + Unit meta; + + CGroupContext cgroup_context; + KillContext kill_context; + + ScopeState state, deserialized_state; + ScopeResult result; + + usec_t timeout_stop_usec; + + char *controller; + sd_bus_track *controller_track; + + bool was_abandoned; + + sd_event_source *timer_event_source; +}; + +extern const UnitVTable scope_vtable; + +int scope_abandon(Scope *s); + +const char* scope_result_to_string(ScopeResult i) _const_; +ScopeResult scope_result_from_string(const char *s) _pure_; + +DEFINE_CAST(SCOPE, Scope); diff --git a/src/core/selinux-access.c b/src/core/selinux-access.c new file mode 100644 index 0000000..0c6d885 --- /dev/null +++ b/src/core/selinux-access.c @@ -0,0 +1,271 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "selinux-access.h" + +#if HAVE_SELINUX + +#include <errno.h> +#include <selinux/avc.h> +#include <selinux/selinux.h> +#include <stdio.h> +#if HAVE_AUDIT +#include <libaudit.h> +#endif + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "audit-fd.h" +#include "bus-util.h" +#include "log.h" +#include "path-util.h" +#include "selinux-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "util.h" + +static bool initialized = false; + +struct audit_info { + sd_bus_creds *creds; + const char *path; + const char *cmdline; +}; + +/* + Any time an access gets denied this callback will be called + with the audit data. We then need to just copy the audit data into the msgbuf. +*/ +static int audit_callback( + void *auditdata, + security_class_t cls, + char *msgbuf, + size_t msgbufsize) { + + const struct audit_info *audit = auditdata; + uid_t uid = 0, login_uid = 0; + gid_t gid = 0; + char login_uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a"; + char uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a"; + char gid_buf[DECIMAL_STR_MAX(gid_t) + 1] = "n/a"; + + if (sd_bus_creds_get_audit_login_uid(audit->creds, &login_uid) >= 0) + xsprintf(login_uid_buf, UID_FMT, login_uid); + if (sd_bus_creds_get_euid(audit->creds, &uid) >= 0) + xsprintf(uid_buf, UID_FMT, uid); + if (sd_bus_creds_get_egid(audit->creds, &gid) >= 0) + xsprintf(gid_buf, GID_FMT, gid); + + snprintf(msgbuf, msgbufsize, + "auid=%s uid=%s gid=%s%s%s%s%s%s%s", + login_uid_buf, uid_buf, gid_buf, + audit->path ? " path=\"" : "", strempty(audit->path), audit->path ? "\"" : "", + audit->cmdline ? " cmdline=\"" : "", strempty(audit->cmdline), audit->cmdline ? "\"" : ""); + + return 0; +} + +static int callback_type_to_priority(int type) { + switch(type) { + + case SELINUX_ERROR: + return LOG_ERR; + + case SELINUX_WARNING: + return LOG_WARNING; + + case SELINUX_INFO: + return LOG_INFO; + + case SELINUX_AVC: + default: + return LOG_NOTICE; + } +} + +/* + libselinux uses this callback when access gets denied or other + events happen. If audit is turned on, messages will be reported + using audit netlink, otherwise they will be logged using the usual + channels. + + Code copied from dbus and modified. +*/ +_printf_(2, 3) static int log_callback(int type, const char *fmt, ...) { + va_list ap; + const char *fmt2; + +#if HAVE_AUDIT + int fd; + + fd = get_audit_fd(); + + if (fd >= 0) { + _cleanup_free_ char *buf = NULL; + int r; + + va_start(ap, fmt); + r = vasprintf(&buf, fmt, ap); + va_end(ap); + + if (r >= 0) { + audit_log_user_avc_message(fd, AUDIT_USER_AVC, buf, NULL, NULL, NULL, 0); + return 0; + } + } +#endif + + fmt2 = strjoina("selinux: ", fmt); + + va_start(ap, fmt); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-nonliteral" + log_internalv(LOG_AUTH | callback_type_to_priority(type), + 0, __FILE__, __LINE__, __FUNCTION__, + fmt2, ap); +#pragma GCC diagnostic pop + va_end(ap); + + return 0; +} + +static int access_init(sd_bus_error *error) { + + if (!mac_selinux_use()) + return 0; + + if (initialized) + return 1; + + if (avc_open(NULL, 0) != 0) { + int enforce, saved_errno = errno; + + enforce = security_getenforce(); + log_full_errno(enforce != 0 ? LOG_ERR : LOG_WARNING, saved_errno, "Failed to open the SELinux AVC: %m"); + + /* If enforcement isn't on, then let's suppress this + * error, and just don't do any AVC checks. The + * warning we printed is hence all the admin will + * see. */ + if (enforce == 0) + return 0; + + /* Return an access denied error, if we couldn't load + * the AVC but enforcing mode was on, or we couldn't + * determine whether it is one. */ + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to open the SELinux AVC: %s", strerror(saved_errno)); + } + + selinux_set_callback(SELINUX_CB_AUDIT, (union selinux_callback) audit_callback); + selinux_set_callback(SELINUX_CB_LOG, (union selinux_callback) log_callback); + + initialized = true; + return 1; +} + +/* + This function communicates with the kernel to check whether or not it should + allow the access. + If the machine is in permissive mode it will return ok. Audit messages will + still be generated if the access would be denied in enforcing mode. +*/ +int mac_selinux_generic_access_check( + sd_bus_message *message, + const char *path, + const char *permission, + sd_bus_error *error) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + const char *tclass = NULL, *scon = NULL; + struct audit_info audit_info = {}; + _cleanup_free_ char *cl = NULL; + char *fcon = NULL; + char **cmdline = NULL; + int r = 0; + + assert(message); + assert(permission); + assert(error); + + r = access_init(error); + if (r <= 0) + return r; + + r = sd_bus_query_sender_creds( + message, + SD_BUS_CREDS_PID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_EGID| + SD_BUS_CREDS_CMDLINE|SD_BUS_CREDS_AUDIT_LOGIN_UID| + SD_BUS_CREDS_SELINUX_CONTEXT| + SD_BUS_CREDS_AUGMENT /* get more bits from /proc */, + &creds); + if (r < 0) + goto finish; + + /* The SELinux context is something we really should have + * gotten directly from the message or sender, and not be an + * augmented field. If it was augmented we cannot use it for + * authorization, since this is racy and vulnerable. Let's add + * an extra check, just in case, even though this really + * shouldn't be possible. */ + assert_return((sd_bus_creds_get_augmented_mask(creds) & SD_BUS_CREDS_SELINUX_CONTEXT) == 0, -EPERM); + + r = sd_bus_creds_get_selinux_context(creds, &scon); + if (r < 0) + goto finish; + + if (path) { + /* Get the file context of the unit file */ + + r = getfilecon_raw(path, &fcon); + if (r < 0) { + r = sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to get file context on %s.", path); + goto finish; + } + + tclass = "service"; + } else { + r = getcon_raw(&fcon); + if (r < 0) { + r = sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to get current context."); + goto finish; + } + + tclass = "system"; + } + + sd_bus_creds_get_cmdline(creds, &cmdline); + cl = strv_join(cmdline, " "); + + audit_info.creds = creds; + audit_info.path = path; + audit_info.cmdline = cl; + + r = selinux_check_access(scon, fcon, tclass, permission, &audit_info); + if (r < 0) + r = sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "SELinux policy denies access."); + + log_debug("SELinux access check scon=%s tcon=%s tclass=%s perm=%s path=%s cmdline=%s: %i", scon, fcon, tclass, permission, path, cl, r); + +finish: + freecon(fcon); + + if (r < 0 && security_getenforce() != 1) { + sd_bus_error_free(error); + r = 0; + } + + return r; +} + +#else + +int mac_selinux_generic_access_check( + sd_bus_message *message, + const char *path, + const char *permission, + sd_bus_error *error) { + + return 0; +} + +#endif diff --git a/src/core/selinux-access.h b/src/core/selinux-access.h new file mode 100644 index 0000000..1e75930 --- /dev/null +++ b/src/core/selinux-access.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" + +#include "bus-util.h" +#include "manager.h" + +int mac_selinux_generic_access_check(sd_bus_message *message, const char *path, const char *permission, sd_bus_error *error); + +#if HAVE_SELINUX + +#define mac_selinux_access_check(message, permission, error) \ + mac_selinux_generic_access_check((message), NULL, (permission), (error)) + +#define mac_selinux_unit_access_check(unit, message, permission, error) \ + mac_selinux_generic_access_check((message), unit_label_path(unit), (permission), (error)) + +#else + +#define mac_selinux_access_check(message, permission, error) 0 +#define mac_selinux_unit_access_check(unit, message, permission, error) 0 + +#endif diff --git a/src/core/selinux-setup.c b/src/core/selinux-setup.c new file mode 100644 index 0000000..bac1aa3 --- /dev/null +++ b/src/core/selinux-setup.c @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <stdio.h> +#include <unistd.h> + +#if HAVE_SELINUX +#include <selinux/selinux.h> +#endif + +#include "log.h" +#include "macro.h" +#include "selinux-setup.h" +#include "selinux-util.h" +#include "string-util.h" +#include "util.h" + +#if HAVE_SELINUX +_printf_(2,3) +static int null_log(int type, const char *fmt, ...) { + return 0; +} +#endif + +int mac_selinux_setup(bool *loaded_policy) { + +#if HAVE_SELINUX + int enforce = 0; + usec_t before_load, after_load; + char *con; + int r; + static const union selinux_callback cb = { + .func_log = null_log, + }; + + bool initialized = false; + + assert(loaded_policy); + + /* Turn off all of SELinux' own logging, we want to do that */ + selinux_set_callback(SELINUX_CB_LOG, cb); + + /* Don't load policy in the initrd if we don't appear to have + * it. For the real root, we check below if we've already + * loaded policy, and return gracefully. + */ + if (in_initrd() && access(selinux_path(), F_OK) < 0) + return 0; + + /* Already initialized by somebody else? */ + r = getcon_raw(&con); + if (r == 0) { + initialized = !streq(con, "kernel"); + freecon(con); + } + + /* Make sure we have no fds open while loading the policy and + * transitioning */ + log_close(); + + /* Now load the policy */ + before_load = now(CLOCK_MONOTONIC); + r = selinux_init_load_policy(&enforce); + if (r == 0) { + _cleanup_(mac_selinux_freep) char *label = NULL; + char timespan[FORMAT_TIMESPAN_MAX]; + + mac_selinux_retest(); + + /* Transition to the new context */ + r = mac_selinux_get_create_label_from_exe(SYSTEMD_BINARY_PATH, &label); + if (r < 0 || !label) { + log_open(); + log_error("Failed to compute init label, ignoring."); + } else { + r = setcon_raw(label); + + log_open(); + if (r < 0) + log_error("Failed to transition into init label '%s', ignoring.", label); + } + + after_load = now(CLOCK_MONOTONIC); + + log_info("Successfully loaded SELinux policy in %s.", + format_timespan(timespan, sizeof(timespan), after_load - before_load, 0)); + + *loaded_policy = true; + + } else { + log_open(); + + if (enforce > 0) { + if (!initialized) { + log_emergency("Failed to load SELinux policy."); + return -EIO; + } + + log_warning("Failed to load new SELinux policy. Continuing with old policy."); + } else + log_debug("Unable to load SELinux policy. Ignoring."); + } +#endif + + return 0; +} diff --git a/src/core/selinux-setup.h b/src/core/selinux-setup.h new file mode 100644 index 0000000..ad0d4f6 --- /dev/null +++ b/src/core/selinux-setup.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <stdbool.h> + +int mac_selinux_setup(bool *loaded_policy); diff --git a/src/core/service.c b/src/core/service.c new file mode 100644 index 0000000..324dcf2 --- /dev/null +++ b/src/core/service.c @@ -0,0 +1,4161 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <signal.h> +#include <unistd.h> + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "async.h" +#include "bus-error.h" +#include "bus-kernel.h" +#include "bus-util.h" +#include "dbus-service.h" +#include "dbus-unit.h" +#include "def.h" +#include "env-util.h" +#include "escape.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "load-dropin.h" +#include "load-fragment.h" +#include "log.h" +#include "manager.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "service.h" +#include "signal-util.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" +#include "utf8.h" +#include "util.h" + +static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = { + [SERVICE_DEAD] = UNIT_INACTIVE, + [SERVICE_START_PRE] = UNIT_ACTIVATING, + [SERVICE_START] = UNIT_ACTIVATING, + [SERVICE_START_POST] = UNIT_ACTIVATING, + [SERVICE_RUNNING] = UNIT_ACTIVE, + [SERVICE_EXITED] = UNIT_ACTIVE, + [SERVICE_RELOAD] = UNIT_RELOADING, + [SERVICE_STOP] = UNIT_DEACTIVATING, + [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_STOP_POST] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_FAILED] = UNIT_FAILED, + [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING +}; + +/* For Type=idle we never want to delay any other jobs, hence we + * consider idle jobs active as soon as we start working on them */ +static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = { + [SERVICE_DEAD] = UNIT_INACTIVE, + [SERVICE_START_PRE] = UNIT_ACTIVE, + [SERVICE_START] = UNIT_ACTIVE, + [SERVICE_START_POST] = UNIT_ACTIVE, + [SERVICE_RUNNING] = UNIT_ACTIVE, + [SERVICE_EXITED] = UNIT_ACTIVE, + [SERVICE_RELOAD] = UNIT_RELOADING, + [SERVICE_STOP] = UNIT_DEACTIVATING, + [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_STOP_POST] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_FAILED] = UNIT_FAILED, + [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING +}; + +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata); +static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata); +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata); + +static void service_enter_signal(Service *s, ServiceState state, ServiceResult f); +static void service_enter_reload_by_notify(Service *s); + +static void service_init(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + s->timeout_start_usec = u->manager->default_timeout_start_usec; + s->timeout_stop_usec = u->manager->default_timeout_stop_usec; + s->restart_usec = u->manager->default_restart_usec; + s->runtime_max_usec = USEC_INFINITY; + s->type = _SERVICE_TYPE_INVALID; + s->socket_fd = -1; + s->stdin_fd = s->stdout_fd = s->stderr_fd = -1; + s->guess_main_pid = true; + + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + + s->exec_context.keyring_mode = MANAGER_IS_SYSTEM(u->manager) ? + EXEC_KEYRING_PRIVATE : EXEC_KEYRING_INHERIT; + + s->watchdog_original_usec = USEC_INFINITY; +} + +static void service_unwatch_control_pid(Service *s) { + assert(s); + + if (s->control_pid <= 0) + return; + + unit_unwatch_pid(UNIT(s), s->control_pid); + s->control_pid = 0; +} + +static void service_unwatch_main_pid(Service *s) { + assert(s); + + if (s->main_pid <= 0) + return; + + unit_unwatch_pid(UNIT(s), s->main_pid); + s->main_pid = 0; +} + +static void service_unwatch_pid_file(Service *s) { + if (!s->pid_file_pathspec) + return; + + log_unit_debug(UNIT(s), "Stopping watch for PID file %s", s->pid_file_pathspec->path); + path_spec_unwatch(s->pid_file_pathspec); + path_spec_done(s->pid_file_pathspec); + s->pid_file_pathspec = mfree(s->pid_file_pathspec); +} + +static int service_set_main_pid(Service *s, pid_t pid) { + pid_t ppid; + + assert(s); + + if (pid <= 1) + return -EINVAL; + + if (pid == getpid_cached()) + return -EINVAL; + + if (s->main_pid == pid && s->main_pid_known) + return 0; + + if (s->main_pid != pid) { + service_unwatch_main_pid(s); + exec_status_start(&s->main_exec_status, pid); + } + + s->main_pid = pid; + s->main_pid_known = true; + + if (get_process_ppid(pid, &ppid) >= 0 && ppid != getpid_cached()) { + log_unit_warning(UNIT(s), "Supervising process "PID_FMT" which is not our child. We'll most likely not notice when it exits.", pid); + s->main_pid_alien = true; + } else + s->main_pid_alien = false; + + return 0; +} + +void service_close_socket_fd(Service *s) { + assert(s); + + /* Undo the effect of service_set_socket_fd(). */ + + s->socket_fd = asynchronous_close(s->socket_fd); + + if (UNIT_ISSET(s->accept_socket)) { + socket_connection_unref(SOCKET(UNIT_DEREF(s->accept_socket))); + unit_ref_unset(&s->accept_socket); + } +} + +static void service_stop_watchdog(Service *s) { + assert(s); + + s->watchdog_event_source = sd_event_source_unref(s->watchdog_event_source); + s->watchdog_timestamp = DUAL_TIMESTAMP_NULL; +} + +static usec_t service_get_watchdog_usec(Service *s) { + assert(s); + + if (s->watchdog_override_enable) + return s->watchdog_override_usec; + + return s->watchdog_original_usec; +} + +static void service_start_watchdog(Service *s) { + usec_t watchdog_usec; + int r; + + assert(s); + + watchdog_usec = service_get_watchdog_usec(s); + if (IN_SET(watchdog_usec, 0, USEC_INFINITY)) { + service_stop_watchdog(s); + return; + } + + if (s->watchdog_event_source) { + r = sd_event_source_set_time(s->watchdog_event_source, usec_add(s->watchdog_timestamp.monotonic, watchdog_usec)); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to reset watchdog timer: %m"); + return; + } + + r = sd_event_source_set_enabled(s->watchdog_event_source, SD_EVENT_ONESHOT); + } else { + r = sd_event_add_time( + UNIT(s)->manager->event, + &s->watchdog_event_source, + CLOCK_MONOTONIC, + usec_add(s->watchdog_timestamp.monotonic, watchdog_usec), 0, + service_dispatch_watchdog, s); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to add watchdog timer: %m"); + return; + } + + (void) sd_event_source_set_description(s->watchdog_event_source, "service-watchdog"); + + /* Let's process everything else which might be a sign + * of living before we consider a service died. */ + r = sd_event_source_set_priority(s->watchdog_event_source, SD_EVENT_PRIORITY_IDLE); + } + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to install watchdog timer: %m"); +} + +static void service_extend_event_source_timeout(Service *s, sd_event_source *source, usec_t extended) { + usec_t current; + int r; + + assert(s); + + /* Extends the specified event source timer to at least the specified time, unless it is already later + * anyway. */ + + if (!source) + return; + + r = sd_event_source_get_time(source, ¤t); + if (r < 0) { + const char *desc; + (void) sd_event_source_get_description(s->timer_event_source, &desc); + log_unit_warning_errno(UNIT(s), r, "Failed to retrieve timeout time for event source '%s', ignoring: %m", strna(desc)); + return; + } + + if (current >= extended) /* Current timeout is already longer, ignore this. */ + return; + + r = sd_event_source_set_time(source, extended); + if (r < 0) { + const char *desc; + (void) sd_event_source_get_description(s->timer_event_source, &desc); + log_unit_warning_errno(UNIT(s), r, "Failed to set timeout time for even source '%s', ignoring %m", strna(desc)); + } +} + +static void service_extend_timeout(Service *s, usec_t extend_timeout_usec) { + usec_t extended; + + assert(s); + + if (IN_SET(extend_timeout_usec, 0, USEC_INFINITY)) + return; + + extended = usec_add(now(CLOCK_MONOTONIC), extend_timeout_usec); + + service_extend_event_source_timeout(s, s->timer_event_source, extended); + service_extend_event_source_timeout(s, s->watchdog_event_source, extended); +} + +static void service_reset_watchdog(Service *s) { + assert(s); + + dual_timestamp_get(&s->watchdog_timestamp); + service_start_watchdog(s); +} + +static void service_override_watchdog_timeout(Service *s, usec_t watchdog_override_usec) { + assert(s); + + s->watchdog_override_enable = true; + s->watchdog_override_usec = watchdog_override_usec; + service_reset_watchdog(s); + + log_unit_debug(UNIT(s), "watchdog_usec="USEC_FMT, s->watchdog_usec); + log_unit_debug(UNIT(s), "watchdog_override_usec="USEC_FMT, s->watchdog_override_usec); +} + +static void service_fd_store_unlink(ServiceFDStore *fs) { + + if (!fs) + return; + + if (fs->service) { + assert(fs->service->n_fd_store > 0); + LIST_REMOVE(fd_store, fs->service->fd_store, fs); + fs->service->n_fd_store--; + } + + if (fs->event_source) { + sd_event_source_set_enabled(fs->event_source, SD_EVENT_OFF); + sd_event_source_unref(fs->event_source); + } + + free(fs->fdname); + safe_close(fs->fd); + free(fs); +} + +static void service_release_fd_store(Service *s) { + assert(s); + + if (s->n_keep_fd_store > 0) + return; + + log_unit_debug(UNIT(s), "Releasing all stored fds"); + while (s->fd_store) + service_fd_store_unlink(s->fd_store); + + assert(s->n_fd_store == 0); +} + +static void service_release_resources(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + if (!s->fd_store && s->stdin_fd < 0 && s->stdout_fd < 0 && s->stderr_fd < 0) + return; + + log_unit_debug(u, "Releasing resources."); + + s->stdin_fd = safe_close(s->stdin_fd); + s->stdout_fd = safe_close(s->stdout_fd); + s->stderr_fd = safe_close(s->stderr_fd); + + service_release_fd_store(s); +} + +static void service_done(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + s->pid_file = mfree(s->pid_file); + s->status_text = mfree(s->status_text); + + s->exec_runtime = exec_runtime_unref(s->exec_runtime, false); + exec_command_free_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX); + s->control_command = NULL; + s->main_command = NULL; + + dynamic_creds_unref(&s->dynamic_creds); + + exit_status_set_free(&s->restart_prevent_status); + exit_status_set_free(&s->restart_force_status); + exit_status_set_free(&s->success_status); + + /* This will leak a process, but at least no memory or any of + * our resources */ + service_unwatch_main_pid(s); + service_unwatch_control_pid(s); + service_unwatch_pid_file(s); + + if (s->bus_name) { + unit_unwatch_bus_name(u, s->bus_name); + s->bus_name = mfree(s->bus_name); + } + + s->bus_name_owner = mfree(s->bus_name_owner); + + s->usb_function_descriptors = mfree(s->usb_function_descriptors); + s->usb_function_strings = mfree(s->usb_function_strings); + + service_close_socket_fd(s); + s->peer = socket_peer_unref(s->peer); + + unit_ref_unset(&s->accept_socket); + + service_stop_watchdog(s); + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + service_release_resources(u); +} + +static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *userdata) { + ServiceFDStore *fs = userdata; + + assert(e); + assert(fs); + + /* If we get either EPOLLHUP or EPOLLERR, it's time to remove this entry from the fd store */ + log_unit_debug(UNIT(fs->service), + "Received %s on stored fd %d (%s), closing.", + revents & EPOLLERR ? "EPOLLERR" : "EPOLLHUP", + fs->fd, strna(fs->fdname)); + service_fd_store_unlink(fs); + return 0; +} + +static int service_add_fd_store(Service *s, int fd, const char *name) { + ServiceFDStore *fs; + int r; + + /* fd is always consumed if we return >= 0 */ + + assert(s); + assert(fd >= 0); + + if (s->n_fd_store >= s->n_fd_store_max) + return -EXFULL; /* Our store is full. + * Use this errno rather than E[NM]FILE to distinguish from + * the case where systemd itself hits the file limit. */ + + LIST_FOREACH(fd_store, fs, s->fd_store) { + r = same_fd(fs->fd, fd); + if (r < 0) + return r; + if (r > 0) { + safe_close(fd); + return 0; /* fd already included */ + } + } + + fs = new0(ServiceFDStore, 1); + if (!fs) + return -ENOMEM; + + fs->fd = fd; + fs->service = s; + fs->fdname = strdup(name ?: "stored"); + if (!fs->fdname) { + free(fs); + return -ENOMEM; + } + + r = sd_event_add_io(UNIT(s)->manager->event, &fs->event_source, fd, 0, on_fd_store_io, fs); + if (r < 0 && r != -EPERM) { /* EPERM indicates fds that aren't pollable, which is OK */ + free(fs->fdname); + free(fs); + return r; + } else if (r >= 0) + (void) sd_event_source_set_description(fs->event_source, "service-fd-store"); + + LIST_PREPEND(fd_store, s->fd_store, fs); + s->n_fd_store++; + + return 1; /* fd newly stored */ +} + +static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name) { + int r; + + assert(s); + + while (fdset_size(fds) > 0) { + _cleanup_close_ int fd = -1; + + fd = fdset_steal_first(fds); + if (fd < 0) + break; + + r = service_add_fd_store(s, fd, name); + if (r == -EXFULL) + return log_unit_warning_errno(UNIT(s), r, + "Cannot store more fds than FileDescriptorStoreMax=%u, closing remaining.", + s->n_fd_store_max); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to add fd to store: %m"); + if (r > 0) + log_unit_debug(UNIT(s), "Added fd %u (%s) to fd store.", fd, strna(name)); + fd = -1; + } + + return 0; +} + +static void service_remove_fd_store(Service *s, const char *name) { + ServiceFDStore *fs, *n; + + assert(s); + assert(name); + + LIST_FOREACH_SAFE(fd_store, fs, n, s->fd_store) { + if (!streq(fs->fdname, name)) + continue; + + log_unit_debug(UNIT(s), "Got explicit request to remove fd %i (%s), closing.", fs->fd, name); + service_fd_store_unlink(fs); + } +} + +static int service_arm_timer(Service *s, usec_t usec) { + int r; + + assert(s); + + if (s->timer_event_source) { + r = sd_event_source_set_time(s->timer_event_source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time( + UNIT(s)->manager->event, + &s->timer_event_source, + CLOCK_MONOTONIC, + usec, 0, + service_dispatch_timer, s); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->timer_event_source, "service-timer"); + + return 0; +} + +static int service_verify(Service *s) { + assert(s); + + if (UNIT(s)->load_state != UNIT_LOADED) + return 0; + + if (!s->exec_command[SERVICE_EXEC_START] && !s->exec_command[SERVICE_EXEC_STOP] + && UNIT(s)->success_action == EMERGENCY_ACTION_NONE) { + /* FailureAction= only makes sense if one of the start or stop commands is specified. + * SuccessAction= will be executed unconditionally if no commands are specified. Hence, + * either a command or SuccessAction= are required. */ + + log_unit_error(UNIT(s), "Service has no ExecStart=, ExecStop=, or SuccessAction=. Refusing."); + return -ENOEXEC; + } + + if (s->type != SERVICE_ONESHOT && !s->exec_command[SERVICE_EXEC_START]) { + log_unit_error(UNIT(s), "Service has no ExecStart= setting, which is only allowed for Type=oneshot services. Refusing."); + return -ENOEXEC; + } + + if (!s->remain_after_exit && !s->exec_command[SERVICE_EXEC_START] && UNIT(s)->success_action == EMERGENCY_ACTION_NONE) { + log_unit_error(UNIT(s), "Service has no ExecStart= and no SuccessAction= settings and does not have RemainAfterExit=yes set. Refusing."); + return -ENOEXEC; + } + + if (s->type != SERVICE_ONESHOT && s->exec_command[SERVICE_EXEC_START]->command_next) { + log_unit_error(UNIT(s), "Service has more than one ExecStart= setting, which is only allowed for Type=oneshot services. Refusing."); + return -ENOEXEC; + } + + if (s->type == SERVICE_ONESHOT && s->restart != SERVICE_RESTART_NO) { + log_unit_error(UNIT(s), "Service has Restart= setting other than no, which isn't allowed for Type=oneshot services. Refusing."); + return -ENOEXEC; + } + + if (s->type == SERVICE_ONESHOT && !exit_status_set_is_empty(&s->restart_force_status)) { + log_unit_error(UNIT(s), "Service has RestartForceStatus= set, which isn't allowed for Type=oneshot services. Refusing."); + return -ENOEXEC; + } + + if (s->type == SERVICE_DBUS && !s->bus_name) { + log_unit_error(UNIT(s), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing."); + return -ENOEXEC; + } + + if (s->bus_name && s->type != SERVICE_DBUS) + log_unit_warning(UNIT(s), "Service has a D-Bus service name specified, but is not of type dbus. Ignoring."); + + if (s->exec_context.pam_name && !IN_SET(s->kill_context.kill_mode, KILL_CONTROL_GROUP, KILL_MIXED)) { + log_unit_error(UNIT(s), "Service has PAM enabled. Kill mode must be set to 'control-group' or 'mixed'. Refusing."); + return -ENOEXEC; + } + + if (s->usb_function_descriptors && !s->usb_function_strings) + log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring."); + + if (!s->usb_function_descriptors && s->usb_function_strings) + log_unit_warning(UNIT(s), "Service has USBFunctionStrings= setting, but no USBFunctionDescriptors=. Ignoring."); + + if (s->runtime_max_usec != USEC_INFINITY && s->type == SERVICE_ONESHOT) + log_unit_warning(UNIT(s), "MaxRuntimeSec= has no effect in combination with Type=oneshot. Ignoring."); + + return 0; +} + +static int service_add_default_dependencies(Service *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + /* Add a number of automatic dependencies useful for the + * majority of services. */ + + if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) { + /* First, pull in the really early boot stuff, and + * require it, so that we fail if we can't acquire + * it. */ + + r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } else { + + /* In the --user instance there's no sysinit.target, + * in that case require basic.target instead. */ + + r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } + + /* Second, if the rest of the base system is in the same + * transaction, order us after it, but do not pull it in or + * even require it. */ + r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + /* Third, add us in for normal shutdown. */ + return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static void service_fix_output(Service *s) { + assert(s); + + /* If nothing has been explicitly configured, patch default output in. If input is socket/tty we avoid this + * however, since in that case we want output to default to the same place as we read input from. */ + + if (s->exec_context.std_error == EXEC_OUTPUT_INHERIT && + s->exec_context.std_output == EXEC_OUTPUT_INHERIT && + s->exec_context.std_input == EXEC_INPUT_NULL) + s->exec_context.std_error = UNIT(s)->manager->default_std_error; + + if (s->exec_context.std_output == EXEC_OUTPUT_INHERIT && + s->exec_context.std_input == EXEC_INPUT_NULL) + s->exec_context.std_output = UNIT(s)->manager->default_std_output; + + if (s->exec_context.std_input == EXEC_INPUT_NULL && + s->exec_context.stdin_data_size > 0) + s->exec_context.std_input = EXEC_INPUT_DATA; +} + +static int service_setup_bus_name(Service *s) { + int r; + + assert(s); + + if (!s->bus_name) + return 0; + + r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m"); + + /* We always want to be ordered against dbus.socket if both are in the transaction. */ + r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m"); + + r = unit_watch_bus_name(UNIT(s), s->bus_name); + if (r == -EEXIST) + return log_unit_error_errno(UNIT(s), r, "Two services allocated for the same bus name %s, refusing operation.", s->bus_name); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Cannot watch bus name %s: %m", s->bus_name); + + return 0; +} + +static int service_add_extras(Service *s) { + int r; + + assert(s); + + if (s->type == _SERVICE_TYPE_INVALID) { + /* Figure out a type automatically */ + if (s->bus_name) + s->type = SERVICE_DBUS; + else if (s->exec_command[SERVICE_EXEC_START]) + s->type = SERVICE_SIMPLE; + else + s->type = SERVICE_ONESHOT; + } + + /* Oneshot services have disabled start timeout by default */ + if (s->type == SERVICE_ONESHOT && !s->start_timeout_defined) + s->timeout_start_usec = USEC_INFINITY; + + service_fix_output(s); + + r = unit_patch_contexts(UNIT(s)); + if (r < 0) + return r; + + r = unit_add_exec_dependencies(UNIT(s), &s->exec_context); + if (r < 0) + return r; + + r = unit_set_default_slice(UNIT(s)); + if (r < 0) + return r; + + /* If the service needs the notify socket, let's enable it automatically. */ + if (s->notify_access == NOTIFY_NONE && + (s->type == SERVICE_NOTIFY || s->watchdog_usec > 0 || s->n_fd_store_max > 0)) + s->notify_access = NOTIFY_MAIN; + + r = service_add_default_dependencies(s); + if (r < 0) + return r; + + r = service_setup_bus_name(s); + if (r < 0) + return r; + + return 0; +} + +static int service_load(Unit *u) { + Service *s = SERVICE(u); + int r; + + assert(s); + + /* Load a .service file */ + r = unit_load_fragment(u); + if (r < 0) + return r; + + /* Still nothing found? Then let's give up */ + if (u->load_state == UNIT_STUB) + return -ENOENT; + + /* This is a new unit? Then let's add in some extras */ + if (u->load_state == UNIT_LOADED) { + + /* We were able to load something, then let's add in + * the dropin directories. */ + r = unit_load_dropin(u); + if (r < 0) + return r; + + /* This is a new unit? Then let's add in some + * extras */ + r = service_add_extras(s); + if (r < 0) + return r; + } + + return service_verify(s); +} + +static void service_dump(Unit *u, FILE *f, const char *prefix) { + char buf_restart[FORMAT_TIMESPAN_MAX], buf_start[FORMAT_TIMESPAN_MAX], buf_stop[FORMAT_TIMESPAN_MAX]; + char buf_runtime[FORMAT_TIMESPAN_MAX], buf_watchdog[FORMAT_TIMESPAN_MAX]; + ServiceExecCommand c; + Service *s = SERVICE(u); + const char *prefix2; + + assert(s); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + fprintf(f, + "%sService State: %s\n" + "%sResult: %s\n" + "%sReload Result: %s\n" + "%sPermissionsStartOnly: %s\n" + "%sRootDirectoryStartOnly: %s\n" + "%sRemainAfterExit: %s\n" + "%sGuessMainPID: %s\n" + "%sType: %s\n" + "%sRestart: %s\n" + "%sNotifyAccess: %s\n" + "%sNotifyState: %s\n", + prefix, service_state_to_string(s->state), + prefix, service_result_to_string(s->result), + prefix, service_result_to_string(s->reload_result), + prefix, yes_no(s->permissions_start_only), + prefix, yes_no(s->root_directory_start_only), + prefix, yes_no(s->remain_after_exit), + prefix, yes_no(s->guess_main_pid), + prefix, service_type_to_string(s->type), + prefix, service_restart_to_string(s->restart), + prefix, notify_access_to_string(s->notify_access), + prefix, notify_state_to_string(s->notify_state)); + + if (s->control_pid > 0) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, s->control_pid); + + if (s->main_pid > 0) + fprintf(f, + "%sMain PID: "PID_FMT"\n" + "%sMain PID Known: %s\n" + "%sMain PID Alien: %s\n", + prefix, s->main_pid, + prefix, yes_no(s->main_pid_known), + prefix, yes_no(s->main_pid_alien)); + + if (s->pid_file) + fprintf(f, + "%sPIDFile: %s\n", + prefix, s->pid_file); + + if (s->bus_name) + fprintf(f, + "%sBusName: %s\n" + "%sBus Name Good: %s\n", + prefix, s->bus_name, + prefix, yes_no(s->bus_name_good)); + + if (UNIT_ISSET(s->accept_socket)) + fprintf(f, + "%sAccept Socket: %s\n", + prefix, UNIT_DEREF(s->accept_socket)->id); + + fprintf(f, + "%sRestartSec: %s\n" + "%sTimeoutStartSec: %s\n" + "%sTimeoutStopSec: %s\n" + "%sRuntimeMaxSec: %s\n" + "%sWatchdogSec: %s\n", + prefix, format_timespan(buf_restart, sizeof(buf_restart), s->restart_usec, USEC_PER_SEC), + prefix, format_timespan(buf_start, sizeof(buf_start), s->timeout_start_usec, USEC_PER_SEC), + prefix, format_timespan(buf_stop, sizeof(buf_stop), s->timeout_stop_usec, USEC_PER_SEC), + prefix, format_timespan(buf_runtime, sizeof(buf_runtime), s->runtime_max_usec, USEC_PER_SEC), + prefix, format_timespan(buf_watchdog, sizeof(buf_watchdog), s->watchdog_usec, USEC_PER_SEC)); + + kill_context_dump(&s->kill_context, f, prefix); + exec_context_dump(&s->exec_context, f, prefix); + + for (c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++) { + + if (!s->exec_command[c]) + continue; + + fprintf(f, "%s-> %s:\n", + prefix, service_exec_command_to_string(c)); + + exec_command_dump_list(s->exec_command[c], f, prefix2); + } + + if (s->status_text) + fprintf(f, "%sStatus Text: %s\n", + prefix, s->status_text); + + if (s->n_fd_store_max > 0) + fprintf(f, + "%sFile Descriptor Store Max: %u\n" + "%sFile Descriptor Store Current: %zu\n", + prefix, s->n_fd_store_max, + prefix, s->n_fd_store); + + cgroup_context_dump(&s->cgroup_context, f, prefix); +} + +static int service_is_suitable_main_pid(Service *s, pid_t pid, int prio) { + Unit *owner; + + assert(s); + assert(pid_is_valid(pid)); + + /* Checks whether the specified PID is suitable as main PID for this service. returns negative if not, 0 if the + * PID is questionnable but should be accepted if the source of configuration is trusted. > 0 if the PID is + * good */ + + if (pid == getpid_cached() || pid == 1) { + log_unit_full(UNIT(s), prio, 0, "New main PID "PID_FMT" is the manager, refusing.", pid); + return -EPERM; + } + + if (pid == s->control_pid) { + log_unit_full(UNIT(s), prio, 0, "New main PID "PID_FMT" is the control process, refusing.", pid); + return -EPERM; + } + + if (!pid_is_alive(pid)) { + log_unit_full(UNIT(s), prio, 0, "New main PID "PID_FMT" does not exist or is a zombie.", pid); + return -ESRCH; + } + + owner = manager_get_unit_by_pid(UNIT(s)->manager, pid); + if (owner == UNIT(s)) { + log_unit_debug(UNIT(s), "New main PID "PID_FMT" belongs to service, we are happy.", pid); + return 1; /* Yay, it's definitely a good PID */ + } + + return 0; /* Hmm it's a suspicious PID, let's accept it if configuration source is trusted */ +} + +static int service_load_pid_file(Service *s, bool may_warn) { + char procfs[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)]; + bool questionable_pid_file = false; + _cleanup_free_ char *k = NULL; + _cleanup_close_ int fd = -1; + int r, prio; + pid_t pid; + + assert(s); + + if (!s->pid_file) + return -ENOENT; + + prio = may_warn ? LOG_INFO : LOG_DEBUG; + + fd = chase_symlinks(s->pid_file, NULL, CHASE_OPEN|CHASE_SAFE, NULL); + if (fd == -ENOLINK) { + log_unit_full(UNIT(s), LOG_DEBUG, fd, "Potentially unsafe symlink chain, will now retry with relaxed checks: %s", s->pid_file); + + questionable_pid_file = true; + + fd = chase_symlinks(s->pid_file, NULL, CHASE_OPEN, NULL); + } + if (fd < 0) + return log_unit_full(UNIT(s), prio, fd, "Can't open PID file %s (yet?) after %s: %m", s->pid_file, service_state_to_string(s->state)); + + /* Let's read the PID file now that we chased it down. But we need to convert the O_PATH fd chase_symlinks() returned us into a proper fd first. */ + xsprintf(procfs, "/proc/self/fd/%i", fd); + r = read_one_line_file(procfs, &k); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Can't convert PID files %s O_PATH file descriptor to proper file descriptor: %m", s->pid_file); + + r = parse_pid(k, &pid); + if (r < 0) + return log_unit_full(UNIT(s), prio, r, "Failed to parse PID from file %s: %m", s->pid_file); + + if (s->main_pid_known && pid == s->main_pid) + return 0; + + r = service_is_suitable_main_pid(s, pid, prio); + if (r < 0) + return r; + if (r == 0) { + struct stat st; + + if (questionable_pid_file) { + log_unit_error(UNIT(s), "Refusing to accept PID outside of service control group, acquired through unsafe symlink chain: %s", s->pid_file); + return -EPERM; + } + + /* Hmm, it's not clear if the new main PID is safe. Let's allow this if the PID file is owned by root */ + + if (fstat(fd, &st) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to fstat() PID file O_PATH fd: %m"); + + if (st.st_uid != 0) { + log_unit_error(UNIT(s), "New main PID "PID_FMT" does not belong to service, and PID file is not owned by root. Refusing.", pid); + return -EPERM; + } + + log_unit_debug(UNIT(s), "New main PID "PID_FMT" does not belong to service, but we'll accept it since PID file is owned by root.", pid); + } + + if (s->main_pid_known) { + log_unit_debug(UNIT(s), "Main PID changing: "PID_FMT" -> "PID_FMT, s->main_pid, pid); + + service_unwatch_main_pid(s); + s->main_pid_known = false; + } else + log_unit_debug(UNIT(s), "Main PID loaded: "PID_FMT, pid); + + r = service_set_main_pid(s, pid); + if (r < 0) + return r; + + r = unit_watch_pid(UNIT(s), pid); + if (r < 0) /* FIXME: we need to do something here */ + return log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" for service: %m", pid); + + return 1; +} + +static void service_search_main_pid(Service *s) { + pid_t pid = 0; + int r; + + assert(s); + + /* If we know it anyway, don't ever fallback to unreliable + * heuristics */ + if (s->main_pid_known) + return; + + if (!s->guess_main_pid) + return; + + assert(s->main_pid <= 0); + + if (unit_search_main_pid(UNIT(s), &pid) < 0) + return; + + log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid); + if (service_set_main_pid(s, pid) < 0) + return; + + r = unit_watch_pid(UNIT(s), pid); + if (r < 0) + /* FIXME: we need to do something here */ + log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" from: %m", pid); +} + +static void service_set_state(Service *s, ServiceState state) { + ServiceState old_state; + const UnitActiveState *table; + + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + table = s->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table; + + old_state = s->state; + s->state = state; + + service_unwatch_pid_file(s); + + if (!IN_SET(state, + SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, + SERVICE_AUTO_RESTART)) + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + + if (!IN_SET(state, + SERVICE_START, SERVICE_START_POST, + SERVICE_RUNNING, SERVICE_RELOAD, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) { + service_unwatch_main_pid(s); + s->main_command = NULL; + } + + if (!IN_SET(state, + SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, + SERVICE_RELOAD, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) { + service_unwatch_control_pid(s); + s->control_command = NULL; + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + } + + if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) { + unit_unwatch_all_pids(UNIT(s)); + unit_dequeue_rewatch_pids(UNIT(s)); + } + + if (!IN_SET(state, + SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, + SERVICE_RUNNING, SERVICE_RELOAD, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) && + !(state == SERVICE_DEAD && UNIT(s)->job)) + service_close_socket_fd(s); + + if (state != SERVICE_START) + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + if (!IN_SET(state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) + service_stop_watchdog(s); + + /* For the inactive states unit_notify() will trim the cgroup, + * but for exit we have to do that ourselves... */ + if (state == SERVICE_EXITED && !MANAGER_IS_RELOADING(UNIT(s)->manager)) + unit_prune_cgroup(UNIT(s)); + + if (old_state != state) + log_unit_debug(UNIT(s), "Changed %s -> %s", service_state_to_string(old_state), service_state_to_string(state)); + + unit_notify(UNIT(s), table[old_state], table[state], + (s->reload_result == SERVICE_SUCCESS ? 0 : UNIT_NOTIFY_RELOAD_FAILURE) | + (s->will_auto_restart ? UNIT_NOTIFY_WILL_AUTO_RESTART : 0)); +} + +static usec_t service_coldplug_timeout(Service *s) { + assert(s); + + switch (s->deserialized_state) { + + case SERVICE_START_PRE: + case SERVICE_START: + case SERVICE_START_POST: + case SERVICE_RELOAD: + return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_start_usec); + + case SERVICE_RUNNING: + return usec_add(UNIT(s)->active_enter_timestamp.monotonic, s->runtime_max_usec); + + case SERVICE_STOP: + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + case SERVICE_STOP_POST: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec); + + case SERVICE_AUTO_RESTART: + return usec_add(UNIT(s)->inactive_enter_timestamp.monotonic, s->restart_usec); + + default: + return USEC_INFINITY; + } +} + +static int service_coldplug(Unit *u) { + Service *s = SERVICE(u); + int r; + + assert(s); + assert(s->state == SERVICE_DEAD); + + if (s->deserialized_state == s->state) + return 0; + + r = service_arm_timer(s, service_coldplug_timeout(s)); + if (r < 0) + return r; + + if (s->main_pid > 0 && + pid_is_unwaited(s->main_pid) && + (IN_SET(s->deserialized_state, + SERVICE_START, SERVICE_START_POST, + SERVICE_RUNNING, SERVICE_RELOAD, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) { + r = unit_watch_pid(UNIT(s), s->main_pid); + if (r < 0) + return r; + } + + if (s->control_pid > 0 && + pid_is_unwaited(s->control_pid) && + IN_SET(s->deserialized_state, + SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, + SERVICE_RELOAD, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) { + r = unit_watch_pid(UNIT(s), s->control_pid); + if (r < 0) + return r; + } + + if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) { + (void) unit_enqueue_rewatch_pids(u); + (void) unit_setup_dynamic_creds(u); + (void) unit_setup_exec_runtime(u); + } + + if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) + service_start_watchdog(s); + + if (UNIT_ISSET(s->accept_socket)) { + Socket* socket = SOCKET(UNIT_DEREF(s->accept_socket)); + + if (socket->max_connections_per_source > 0) { + SocketPeer *peer; + + /* Make a best-effort attempt at bumping the connection count */ + if (socket_acquire_peer(socket, s->socket_fd, &peer) > 0) { + socket_peer_unref(s->peer); + s->peer = peer; + } + } + } + + service_set_state(s, s->deserialized_state); + return 0; +} + +static int service_collect_fds( + Service *s, + int **fds, + char ***fd_names, + size_t *n_socket_fds, + size_t *n_storage_fds) { + + _cleanup_strv_free_ char **rfd_names = NULL; + _cleanup_free_ int *rfds = NULL; + size_t rn_socket_fds = 0, rn_storage_fds = 0; + int r; + + assert(s); + assert(fds); + assert(fd_names); + assert(n_socket_fds); + assert(n_storage_fds); + + if (s->socket_fd >= 0) { + + /* Pass the per-connection socket */ + + rfds = new(int, 1); + if (!rfds) + return -ENOMEM; + rfds[0] = s->socket_fd; + + rfd_names = strv_new("connection"); + if (!rfd_names) + return -ENOMEM; + + rn_socket_fds = 1; + } else { + Iterator i; + void *v; + Unit *u; + + /* Pass all our configured sockets for singleton services */ + + HASHMAP_FOREACH_KEY(v, u, UNIT(s)->dependencies[UNIT_TRIGGERED_BY], i) { + _cleanup_free_ int *cfds = NULL; + Socket *sock; + int cn_fds; + + if (u->type != UNIT_SOCKET) + continue; + + sock = SOCKET(u); + + cn_fds = socket_collect_fds(sock, &cfds); + if (cn_fds < 0) + return cn_fds; + + if (cn_fds <= 0) + continue; + + if (!rfds) { + rfds = TAKE_PTR(cfds); + rn_socket_fds = cn_fds; + } else { + int *t; + + t = reallocarray(rfds, rn_socket_fds + cn_fds, sizeof(int)); + if (!t) + return -ENOMEM; + + memcpy(t + rn_socket_fds, cfds, cn_fds * sizeof(int)); + + rfds = t; + rn_socket_fds += cn_fds; + } + + r = strv_extend_n(&rfd_names, socket_fdname(sock), cn_fds); + if (r < 0) + return r; + } + } + + if (s->n_fd_store > 0) { + ServiceFDStore *fs; + size_t n_fds; + char **nl; + int *t; + + t = reallocarray(rfds, rn_socket_fds + s->n_fd_store, sizeof(int)); + if (!t) + return -ENOMEM; + + rfds = t; + + nl = reallocarray(rfd_names, rn_socket_fds + s->n_fd_store + 1, sizeof(char *)); + if (!nl) + return -ENOMEM; + + rfd_names = nl; + n_fds = rn_socket_fds; + + LIST_FOREACH(fd_store, fs, s->fd_store) { + rfds[n_fds] = fs->fd; + rfd_names[n_fds] = strdup(strempty(fs->fdname)); + if (!rfd_names[n_fds]) + return -ENOMEM; + + rn_storage_fds++; + n_fds++; + } + + rfd_names[n_fds] = NULL; + } + + *fds = TAKE_PTR(rfds); + *fd_names = TAKE_PTR(rfd_names); + *n_socket_fds = rn_socket_fds; + *n_storage_fds = rn_storage_fds; + + return 0; +} + +static int service_allocate_exec_fd_event_source( + Service *s, + int fd, + sd_event_source **ret_event_source) { + + _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL; + int r; + + assert(s); + assert(fd >= 0); + assert(ret_event_source); + + r = sd_event_add_io(UNIT(s)->manager->event, &source, fd, 0, service_dispatch_exec_io, s); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m"); + + /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */ + + r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m"); + + (void) sd_event_source_set_description(source, "service event_fd"); + + r = sd_event_source_set_io_fd_own(source, true); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m"); + + *ret_event_source = TAKE_PTR(source); + return 0; +} + +static int service_allocate_exec_fd( + Service *s, + sd_event_source **ret_event_source, + int* ret_exec_fd) { + + _cleanup_close_pair_ int p[2] = { -1, -1 }; + int r; + + assert(s); + assert(ret_event_source); + assert(ret_exec_fd); + + if (pipe2(p, O_CLOEXEC|O_NONBLOCK) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m"); + + r = service_allocate_exec_fd_event_source(s, p[0], ret_event_source); + if (r < 0) + return r; + + p[0] = -1; + *ret_exec_fd = TAKE_FD(p[1]); + + return 0; +} + +static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) { + assert(s); + + /* Notifications are accepted depending on the process and + * the access setting of the service: + * process: \ access: NONE MAIN EXEC ALL + * main no yes yes yes + * control no no yes yes + * other (forked) no no no yes */ + + if (flags & EXEC_IS_CONTROL) + /* A control process */ + return IN_SET(s->notify_access, NOTIFY_EXEC, NOTIFY_ALL); + + /* We only spawn main processes and control processes, so any + * process that is not a control process is a main process */ + return s->notify_access != NOTIFY_NONE; +} + +static int service_spawn( + Service *s, + ExecCommand *c, + usec_t timeout, + ExecFlags flags, + pid_t *_pid) { + + _cleanup_(exec_params_clear) ExecParameters exec_params = { + .flags = flags, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, + }; + _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL, **fd_names = NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL; + size_t n_socket_fds = 0, n_storage_fds = 0, n_env = 0; + _cleanup_close_ int exec_fd = -1; + _cleanup_free_ int *fds = NULL; + pid_t pid; + int r; + + assert(s); + assert(c); + assert(_pid); + + r = unit_prepare_exec(UNIT(s)); /* This realizes the cgroup, among other things */ + if (r < 0) + return r; + + if (flags & EXEC_IS_CONTROL) { + /* If this is a control process, mask the permissions/chroot application if this is requested. */ + if (s->permissions_start_only) + exec_params.flags &= ~EXEC_APPLY_SANDBOXING; + if (s->root_directory_start_only) + exec_params.flags &= ~EXEC_APPLY_CHROOT; + } + + if ((flags & EXEC_PASS_FDS) || + s->exec_context.std_input == EXEC_INPUT_SOCKET || + s->exec_context.std_output == EXEC_OUTPUT_SOCKET || + s->exec_context.std_error == EXEC_OUTPUT_SOCKET) { + + r = service_collect_fds(s, &fds, &fd_names, &n_socket_fds, &n_storage_fds); + if (r < 0) + return r; + + log_unit_debug(UNIT(s), "Passing %zu fds to service", n_socket_fds + n_storage_fds); + } + + if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { + assert(!s->exec_fd_event_source); + + r = service_allocate_exec_fd(s, &exec_fd_source, &exec_fd); + if (r < 0) + return r; + } + + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), timeout)); + if (r < 0) + return r; + + our_env = new0(char*, 9); + if (!our_env) + return -ENOMEM; + + if (service_exec_needs_notify_socket(s, flags)) + if (asprintf(our_env + n_env++, "NOTIFY_SOCKET=%s", UNIT(s)->manager->notify_socket) < 0) + return -ENOMEM; + + if (s->main_pid > 0) + if (asprintf(our_env + n_env++, "MAINPID="PID_FMT, s->main_pid) < 0) + return -ENOMEM; + + if (MANAGER_IS_USER(UNIT(s)->manager)) + if (asprintf(our_env + n_env++, "MANAGERPID="PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + + if (s->socket_fd >= 0) { + union sockaddr_union sa; + socklen_t salen = sizeof(sa); + + /* If this is a per-connection service instance, let's set $REMOTE_ADDR and $REMOTE_PORT to something + * useful. Note that we do this only when we are still connected at this point in time, which we might + * very well not be. Hence we ignore all errors when retrieving peer information (as that might result + * in ENOTCONN), and just use whate we can use. */ + + if (getpeername(s->socket_fd, &sa.sa, &salen) >= 0 && + IN_SET(sa.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) { + + _cleanup_free_ char *addr = NULL; + char *t; + unsigned port; + + r = sockaddr_pretty(&sa.sa, salen, true, false, &addr); + if (r < 0) + return r; + + t = strappend("REMOTE_ADDR=", addr); + if (!t) + return -ENOMEM; + our_env[n_env++] = t; + + r = sockaddr_port(&sa.sa, &port); + if (r < 0) + return r; + + if (asprintf(&t, "REMOTE_PORT=%u", port) < 0) + return -ENOMEM; + our_env[n_env++] = t; + } + } + + if (flags & EXEC_SETENV_RESULT) { + if (asprintf(our_env + n_env++, "SERVICE_RESULT=%s", service_result_to_string(s->result)) < 0) + return -ENOMEM; + + if (s->main_exec_status.pid > 0 && + dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) { + if (asprintf(our_env + n_env++, "EXIT_CODE=%s", sigchld_code_to_string(s->main_exec_status.code)) < 0) + return -ENOMEM; + + if (s->main_exec_status.code == CLD_EXITED) + r = asprintf(our_env + n_env++, "EXIT_STATUS=%i", s->main_exec_status.status); + else + r = asprintf(our_env + n_env++, "EXIT_STATUS=%s", signal_to_string(s->main_exec_status.status)); + if (r < 0) + return -ENOMEM; + } + } + + r = unit_set_exec_params(UNIT(s), &exec_params); + if (r < 0) + return r; + + final_env = strv_env_merge(2, exec_params.environment, our_env, NULL); + if (!final_env) + return -ENOMEM; + + /* System D-Bus needs nss-systemd disabled, so that we don't deadlock */ + SET_FLAG(exec_params.flags, EXEC_NSS_BYPASS_BUS, + MANAGER_IS_SYSTEM(UNIT(s)->manager) && unit_has_name(UNIT(s), SPECIAL_DBUS_SERVICE)); + + strv_free_and_replace(exec_params.environment, final_env); + exec_params.fds = fds; + exec_params.fd_names = fd_names; + exec_params.n_socket_fds = n_socket_fds; + exec_params.n_storage_fds = n_storage_fds; + exec_params.watchdog_usec = service_get_watchdog_usec(s); + exec_params.selinux_context_net = s->socket_fd_selinux_context_net; + if (s->type == SERVICE_IDLE) + exec_params.idle_pipe = UNIT(s)->manager->idle_pipe; + exec_params.stdin_fd = s->stdin_fd; + exec_params.stdout_fd = s->stdout_fd; + exec_params.stderr_fd = s->stderr_fd; + exec_params.exec_fd = exec_fd; + + r = exec_spawn(UNIT(s), + c, + &s->exec_context, + &exec_params, + s->exec_runtime, + &s->dynamic_creds, + &pid); + if (r < 0) + return r; + + s->exec_fd_event_source = TAKE_PTR(exec_fd_source); + s->exec_fd_hot = false; + + r = unit_watch_pid(UNIT(s), pid); + if (r < 0) /* FIXME: we need to do something here */ + return r; + + *_pid = pid; + + return 0; +} + +static int main_pid_good(Service *s) { + assert(s); + + /* Returns 0 if the pid is dead, > 0 if it is good, < 0 if we don't know */ + + /* If we know the pid file, then let's just check if it is + * still valid */ + if (s->main_pid_known) { + + /* If it's an alien child let's check if it is still + * alive ... */ + if (s->main_pid_alien && s->main_pid > 0) + return pid_is_alive(s->main_pid); + + /* .. otherwise assume we'll get a SIGCHLD for it, + * which we really should wait for to collect exit + * status and code */ + return s->main_pid > 0; + } + + /* We don't know the pid */ + return -EAGAIN; +} + +static int control_pid_good(Service *s) { + assert(s); + + /* Returns 0 if the control PID is dead, > 0 if it is good. We never actually return < 0 here, but in order to + * make this function as similar as possible to main_pid_good() and cgroup_good(), we pretend that < 0 also + * means: we can't figure it out. */ + + return s->control_pid > 0; +} + +static int cgroup_good(Service *s) { + int r; + + assert(s); + + /* Returns 0 if the cgroup is empty or doesn't exist, > 0 if it is exists and is populated, < 0 if we can't + * figure it out */ + + if (!UNIT(s)->cgroup_path) + return 0; + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path); + if (r < 0) + return r; + + return r == 0; +} + +static bool service_shall_restart(Service *s) { + assert(s); + + /* Don't restart after manual stops */ + if (s->forbid_restart) + return false; + + /* Never restart if this is configured as special exception */ + if (exit_status_set_test(&s->restart_prevent_status, s->main_exec_status.code, s->main_exec_status.status)) + return false; + + /* Restart if the exit code/status are configured as restart triggers */ + if (exit_status_set_test(&s->restart_force_status, s->main_exec_status.code, s->main_exec_status.status)) + return true; + + switch (s->restart) { + + case SERVICE_RESTART_NO: + return false; + + case SERVICE_RESTART_ALWAYS: + return true; + + case SERVICE_RESTART_ON_SUCCESS: + return s->result == SERVICE_SUCCESS; + + case SERVICE_RESTART_ON_FAILURE: + return s->result != SERVICE_SUCCESS; + + case SERVICE_RESTART_ON_ABNORMAL: + return !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_FAILURE_EXIT_CODE); + + case SERVICE_RESTART_ON_WATCHDOG: + return s->result == SERVICE_FAILURE_WATCHDOG; + + case SERVICE_RESTART_ON_ABORT: + return IN_SET(s->result, SERVICE_FAILURE_SIGNAL, SERVICE_FAILURE_CORE_DUMP); + + default: + assert_not_reached("unknown restart setting"); + } +} + +static bool service_will_restart(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + if (s->will_auto_restart) + return true; + if (s->state == SERVICE_AUTO_RESTART) + return true; + if (!UNIT(s)->job) + return false; + if (UNIT(s)->job->type == JOB_START) + return true; + + return false; +} + +static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) { + int r; + + assert(s); + + /* If there's a stop job queued before we enter the DEAD state, we shouldn't act on Restart=, in order to not + * undo what has already been enqueued. */ + if (unit_stop_pending(UNIT(s))) + allow_restart = false; + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + unit_log_result(UNIT(s), s->result == SERVICE_SUCCESS, service_result_to_string(s->result)); + + if (allow_restart && service_shall_restart(s)) + s->will_auto_restart = true; + + /* Make sure service_release_resources() doesn't destroy our FD store, while we are changing through + * SERVICE_FAILED/SERVICE_DEAD before entering into SERVICE_AUTO_RESTART. */ + s->n_keep_fd_store ++; + + service_set_state(s, s->result != SERVICE_SUCCESS ? SERVICE_FAILED : SERVICE_DEAD); + + if (s->will_auto_restart) { + s->will_auto_restart = false; + + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->restart_usec)); + if (r < 0) { + s->n_keep_fd_store--; + goto fail; + } + + service_set_state(s, SERVICE_AUTO_RESTART); + } else + /* If we shan't restart, then flush out the restart counter. But don't do that immediately, so that the + * user can still introspect the counter. Do so on the next start. */ + s->flush_n_restarts = true; + + /* The new state is in effect, let's decrease the fd store ref counter again. Let's also readd us to the GC + * queue, so that the fd store is possibly gc'ed again */ + s->n_keep_fd_store--; + unit_add_to_gc_queue(UNIT(s)); + + /* The next restart might not be a manual stop, hence reset the flag indicating manual stops */ + s->forbid_restart = false; + + /* We want fresh tmpdirs in case service is started again immediately */ + s->exec_runtime = exec_runtime_unref(s->exec_runtime, true); + + if (s->exec_context.runtime_directory_preserve_mode == EXEC_PRESERVE_NO || + (s->exec_context.runtime_directory_preserve_mode == EXEC_PRESERVE_RESTART && !service_will_restart(UNIT(s)))) + /* Also, remove the runtime directory */ + exec_context_destroy_runtime_directory(&s->exec_context, UNIT(s)->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + + /* Get rid of the IPC bits of the user */ + unit_unref_uid_gid(UNIT(s), true); + + /* Release the user, and destroy it if we are the only remaining owner */ + dynamic_creds_destroy(&s->dynamic_creds); + + /* Try to delete the pid file. At this point it will be + * out-of-date, and some software might be confused by it, so + * let's remove it. */ + if (s->pid_file) + (void) unlink(s->pid_file); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run install restart timer: %m"); + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, false); +} + +static void service_enter_stop_post(Service *s, ServiceResult f) { + int r; + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + service_unwatch_control_pid(s); + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_STOP_POST; + + r = service_spawn(s, + s->control_command, + s->timeout_stop_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_STOP_POST); + } else + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop-post' task: %m"); + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES); +} + +static int state_to_kill_operation(ServiceState state) { + switch (state) { + + case SERVICE_STOP_WATCHDOG: + return KILL_WATCHDOG; + + case SERVICE_STOP_SIGTERM: + case SERVICE_FINAL_SIGTERM: + return KILL_TERMINATE; + + case SERVICE_STOP_SIGKILL: + case SERVICE_FINAL_SIGKILL: + return KILL_KILL; + + default: + return _KILL_OPERATION_INVALID; + } +} + +static void service_enter_signal(Service *s, ServiceState state, ServiceResult f) { + int r; + + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + /* Before sending any signal, make sure we track all members of this cgroup */ + (void) unit_watch_all_pids(UNIT(s)); + + /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have + * died now */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + r = unit_kill_context( + UNIT(s), + &s->kill_context, + state_to_kill_operation(state), + s->main_pid, + s->control_pid, + s->main_pid_alien); + if (r < 0) + goto fail; + + if (r > 0) { + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec)); + if (r < 0) + goto fail; + + service_set_state(s, state); + } else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM) && s->kill_context.send_sigkill) + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS); + else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL)) + service_enter_stop_post(s, SERVICE_SUCCESS); + else if (state == SERVICE_FINAL_SIGTERM && s->kill_context.send_sigkill) + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS); + else + service_enter_dead(s, SERVICE_SUCCESS, true); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + + if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL)) + service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES); + else + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true); +} + +static void service_enter_stop_by_notify(Service *s) { + assert(s); + + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec)); + + /* The service told us it's stopping, so it's as if we SIGTERM'd it. */ + service_set_state(s, SERVICE_STOP_SIGTERM); +} + +static void service_enter_stop(Service *s, ServiceResult f) { + int r; + + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + service_unwatch_control_pid(s); + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + s->control_command = s->exec_command[SERVICE_EXEC_STOP]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_STOP; + + r = service_spawn(s, + s->control_command, + s->timeout_stop_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_STOP); + } else + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop' task: %m"); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); +} + +static bool service_good(Service *s) { + int main_pid_ok; + assert(s); + + if (s->type == SERVICE_DBUS && !s->bus_name_good) + return false; + + main_pid_ok = main_pid_good(s); + if (main_pid_ok > 0) /* It's alive */ + return true; + if (main_pid_ok == 0) /* It's dead */ + return false; + + /* OK, we don't know anything about the main PID, maybe + * because there is none. Let's check the control group + * instead. */ + + return cgroup_good(s) != 0; +} + +static void service_enter_running(Service *s, ServiceResult f) { + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + service_unwatch_control_pid(s); + + if (s->result != SERVICE_SUCCESS) + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + else if (service_good(s)) { + + /* If there are any queued up sd_notify() notifications, process them now */ + if (s->notify_state == NOTIFY_RELOADING) + service_enter_reload_by_notify(s); + else if (s->notify_state == NOTIFY_STOPPING) + service_enter_stop_by_notify(s); + else { + service_set_state(s, SERVICE_RUNNING); + service_arm_timer(s, usec_add(UNIT(s)->active_enter_timestamp.monotonic, s->runtime_max_usec)); + } + + } else if (s->remain_after_exit) + service_set_state(s, SERVICE_EXITED); + else + service_enter_stop(s, SERVICE_SUCCESS); +} + +static void service_enter_start_post(Service *s) { + int r; + assert(s); + + service_unwatch_control_pid(s); + service_reset_watchdog(s); + + s->control_command = s->exec_command[SERVICE_EXEC_START_POST]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_START_POST; + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_START_POST); + } else + service_enter_running(s, SERVICE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-post' task: %m"); + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); +} + +static void service_kill_control_process(Service *s) { + int r; + + assert(s); + + if (s->control_pid <= 0) + return; + + r = kill_and_sigcont(s->control_pid, SIGKILL); + if (r < 0) { + _cleanup_free_ char *comm = NULL; + + (void) get_process_comm(s->control_pid, &comm); + + log_unit_debug_errno(UNIT(s), r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", + s->control_pid, strna(comm)); + } +} + +static void service_enter_start(Service *s) { + ExecCommand *c; + usec_t timeout; + pid_t pid; + int r; + + assert(s); + + service_unwatch_control_pid(s); + service_unwatch_main_pid(s); + + unit_warn_leftover_processes(UNIT(s)); + + if (s->type == SERVICE_FORKING) { + s->control_command_id = SERVICE_EXEC_START; + c = s->control_command = s->exec_command[SERVICE_EXEC_START]; + + s->main_command = NULL; + } else { + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + s->control_command = NULL; + + c = s->main_command = s->exec_command[SERVICE_EXEC_START]; + } + + if (!c) { + if (s->type != SERVICE_ONESHOT) { + /* There's no command line configured for the main command? Hmm, that is strange. This can only + * happen if the configuration changes at runtime. In this case, let's enter a failure + * state. */ + log_unit_error(UNIT(s), "There's no 'start' task anymore we could start."); + r = -ENXIO; + goto fail; + } + + /* We force a fake state transition here. Otherwise, the unit would go directly from + * SERVICE_DEAD to SERVICE_DEAD without SERVICE_ACTIVATING or SERVICE_ACTIVE + * inbetween. This way we can later trigger actions that depend on the state + * transition, including SuccessAction=. */ + service_set_state(s, SERVICE_START); + + service_enter_start_post(s); + return; + } + + if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) + /* For simple + idle this is the main process. We don't apply any timeout here, but + * service_enter_running() will later apply the .runtime_max_usec timeout. */ + timeout = USEC_INFINITY; + else + timeout = s->timeout_start_usec; + + r = service_spawn(s, + c, + timeout, + EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG, + &pid); + if (r < 0) + goto fail; + + if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) { + /* For simple services we immediately start + * the START_POST binaries. */ + + service_set_main_pid(s, pid); + service_enter_start_post(s); + + } else if (s->type == SERVICE_FORKING) { + + /* For forking services we wait until the start + * process exited. */ + + s->control_pid = pid; + service_set_state(s, SERVICE_START); + + } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_EXEC)) { + + /* For oneshot services we wait until the start process exited, too, but it is our main process. */ + + /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the + * bus. 'notify' and 'exec' services are similar. */ + + service_set_main_pid(s, pid); + service_set_state(s, SERVICE_START); + } else + assert_not_reached("Unknown service type"); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start' task: %m"); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); +} + +static void service_enter_start_pre(Service *s) { + int r; + + assert(s); + + service_unwatch_control_pid(s); + + s->control_command = s->exec_command[SERVICE_EXEC_START_PRE]; + if (s->control_command) { + + unit_warn_leftover_processes(UNIT(s)); + + s->control_command_id = SERVICE_EXEC_START_PRE; + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_START_PRE); + } else + service_enter_start(s); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-pre' task: %m"); + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true); +} + +static void service_enter_restart(Service *s) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(s); + + if (UNIT(s)->job && UNIT(s)->job->type == JOB_STOP) { + /* Don't restart things if we are going down anyway */ + log_unit_info(UNIT(s), "Stop job pending for unit, delaying automatic restart."); + + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->restart_usec)); + if (r < 0) + goto fail; + + return; + } + + /* Any units that are bound to this service must also be + * restarted. We use JOB_RESTART (instead of the more obvious + * JOB_START) here so that those dependency jobs will be added + * as well. */ + r = manager_add_job(UNIT(s)->manager, JOB_RESTART, UNIT(s), JOB_REPLACE, &error, NULL); + if (r < 0) + goto fail; + + /* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't fully + * stopped, i.e. as long as it remains up or remains in auto-start states. The use can reset the counter + * explicitly however via the usual "systemctl reset-failure" logic. */ + s->n_restarts ++; + s->flush_n_restarts = false; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR, + LOG_UNIT_ID(UNIT(s)), + LOG_UNIT_INVOCATION_ID(UNIT(s)), + LOG_UNIT_MESSAGE(UNIT(s), "Scheduled restart job, restart counter is at %u.", s->n_restarts), + "N_RESTARTS=%u", s->n_restarts); + + /* Notify clients about changed restart counter */ + unit_add_to_dbus_queue(UNIT(s)); + + /* Note that we stay in the SERVICE_AUTO_RESTART state here, + * it will be canceled as part of the service_stop() call that + * is executed as part of JOB_RESTART. */ + + return; + +fail: + log_unit_warning(UNIT(s), "Failed to schedule restart job: %s", bus_error_message(&error, -r)); + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, false); +} + +static void service_enter_reload_by_notify(Service *s) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(s); + + service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_start_usec)); + service_set_state(s, SERVICE_RELOAD); + + /* service_enter_reload_by_notify is never called during a reload, thus no loops are possible. */ + r = manager_propagate_reload(UNIT(s)->manager, UNIT(s), JOB_FAIL, &error); + if (r < 0) + log_unit_warning(UNIT(s), "Failed to schedule propagation of reload: %s", bus_error_message(&error, -r)); +} + +static void service_enter_reload(Service *s) { + int r; + + assert(s); + + service_unwatch_control_pid(s); + s->reload_result = SERVICE_SUCCESS; + + s->control_command = s->exec_command[SERVICE_EXEC_RELOAD]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_RELOAD; + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_RELOAD); + } else + service_enter_running(s, SERVICE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'reload' task: %m"); + s->reload_result = SERVICE_FAILURE_RESOURCES; + service_enter_running(s, SERVICE_SUCCESS); +} + +static void service_run_next_control(Service *s) { + usec_t timeout; + int r; + + assert(s); + assert(s->control_command); + assert(s->control_command->command_next); + + assert(s->control_command_id != SERVICE_EXEC_START); + + s->control_command = s->control_command->command_next; + service_unwatch_control_pid(s); + + if (IN_SET(s->state, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) + timeout = s->timeout_start_usec; + else + timeout = s->timeout_stop_usec; + + r = service_spawn(s, + s->control_command, + timeout, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL| + (IN_SET(s->control_command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_STOP_POST) ? EXEC_APPLY_TTY_STDIN : 0)| + (IN_SET(s->control_command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_SETENV_RESULT : 0)| + (IN_SET(s->control_command_id, SERVICE_EXEC_START_POST, SERVICE_EXEC_RELOAD, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_CONTROL_CGROUP : 0), + &s->control_pid); + if (r < 0) + goto fail; + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run next control task: %m"); + + if (IN_SET(s->state, SERVICE_START_PRE, SERVICE_START_POST, SERVICE_STOP)) + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); + else if (s->state == SERVICE_STOP_POST) + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true); + else if (s->state == SERVICE_RELOAD) { + s->reload_result = SERVICE_FAILURE_RESOURCES; + service_enter_running(s, SERVICE_SUCCESS); + } else + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); +} + +static void service_run_next_main(Service *s) { + pid_t pid; + int r; + + assert(s); + assert(s->main_command); + assert(s->main_command->command_next); + assert(s->type == SERVICE_ONESHOT); + + s->main_command = s->main_command->command_next; + service_unwatch_main_pid(s); + + r = service_spawn(s, + s->main_command, + s->timeout_start_usec, + EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG, + &pid); + if (r < 0) + goto fail; + + service_set_main_pid(s, pid); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run next main task: %m"); + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); +} + +static int service_start(Unit *u) { + Service *s = SERVICE(u); + int r; + + assert(s); + + /* We cannot fulfill this request right now, try again later + * please! */ + if (IN_SET(s->state, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) + return -EAGAIN; + + /* Already on it! */ + if (IN_SET(s->state, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST)) + return 0; + + /* A service that will be restarted must be stopped first to + * trigger BindsTo and/or OnFailure dependencies. If a user + * does not want to wait for the holdoff time to elapse, the + * service should be manually restarted, not started. We + * simply return EAGAIN here, so that any start jobs stay + * queued, and assume that the auto restart timer will + * eventually trigger the restart. */ + if (s->state == SERVICE_AUTO_RESTART) + return -EAGAIN; + + assert(IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED)); + + /* Make sure we don't enter a busy loop of some kind. */ + r = unit_start_limit_test(u); + if (r < 0) { + service_enter_dead(s, SERVICE_FAILURE_START_LIMIT_HIT, false); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + s->result = SERVICE_SUCCESS; + s->reload_result = SERVICE_SUCCESS; + s->main_pid_known = false; + s->main_pid_alien = false; + s->forbid_restart = false; + + s->status_text = mfree(s->status_text); + s->status_errno = 0; + + s->notify_state = NOTIFY_UNKNOWN; + + s->watchdog_original_usec = s->watchdog_usec; + s->watchdog_override_enable = false; + s->watchdog_override_usec = USEC_INFINITY; + + exec_command_reset_status_list_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX); + exec_status_reset(&s->main_exec_status); + + /* This is not an automatic restart? Flush the restart counter then */ + if (s->flush_n_restarts) { + s->n_restarts = 0; + s->flush_n_restarts = false; + } + + u->reset_accounting = true; + + service_enter_start_pre(s); + return 1; +} + +static int service_stop(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + /* Don't create restart jobs from manual stops. */ + s->forbid_restart = true; + + /* Already on it */ + if (IN_SET(s->state, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) + return 0; + + /* A restart will be scheduled or is in progress. */ + if (s->state == SERVICE_AUTO_RESTART) { + service_set_state(s, SERVICE_DEAD); + return 0; + } + + /* If there's already something running we go directly into + * kill mode. */ + if (IN_SET(s->state, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RELOAD)) { + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS); + return 0; + } + + assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED)); + + service_enter_stop(s, SERVICE_SUCCESS); + return 1; +} + +static int service_reload(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED)); + + service_enter_reload(s); + return 1; +} + +_pure_ static bool service_can_reload(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + return !!s->exec_command[SERVICE_EXEC_RELOAD]; +} + +static unsigned service_exec_command_index(Unit *u, ServiceExecCommand id, ExecCommand *current) { + Service *s = SERVICE(u); + unsigned idx = 0; + ExecCommand *first, *c; + + assert(s); + + first = s->exec_command[id]; + + /* Figure out where we are in the list by walking back to the beginning */ + for (c = current; c != first; c = c->command_prev) + idx++; + + return idx; +} + +static int service_serialize_exec_command(Unit *u, FILE *f, ExecCommand *command) { + _cleanup_free_ char *args = NULL, *p = NULL; + size_t allocated = 0, length = 0; + Service *s = SERVICE(u); + const char *type, *key; + ServiceExecCommand id; + unsigned idx; + char **arg; + + assert(s); + assert(f); + + if (!command) + return 0; + + if (command == s->control_command) { + type = "control"; + id = s->control_command_id; + } else { + type = "main"; + id = SERVICE_EXEC_START; + } + + idx = service_exec_command_index(u, id, command); + + STRV_FOREACH(arg, command->argv) { + _cleanup_free_ char *e = NULL; + size_t n; + + e = cescape(*arg); + if (!e) + return log_oom(); + + n = strlen(e); + if (!GREEDY_REALLOC(args, allocated, length + 1 + n + 1)) + return log_oom(); + + if (length > 0) + args[length++] = ' '; + + memcpy(args + length, e, n); + length += n; + } + + if (!GREEDY_REALLOC(args, allocated, length + 1)) + return log_oom(); + + args[length++] = 0; + + p = cescape(command->path); + if (!p) + return -ENOMEM; + + key = strjoina(type, "-command"); + return serialize_item_format(f, key, "%s %u %s %s", service_exec_command_to_string(id), idx, p, args); +} + +static int service_serialize(Unit *u, FILE *f, FDSet *fds) { + Service *s = SERVICE(u); + ServiceFDStore *fs; + int r; + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", service_state_to_string(s->state)); + (void) serialize_item(f, "result", service_result_to_string(s->result)); + (void) serialize_item(f, "reload-result", service_result_to_string(s->reload_result)); + + if (s->control_pid > 0) + (void) serialize_item_format(f, "control-pid", PID_FMT, s->control_pid); + + if (s->main_pid_known && s->main_pid > 0) + (void) serialize_item_format(f, "main-pid", PID_FMT, s->main_pid); + + (void) serialize_bool(f, "main-pid-known", s->main_pid_known); + (void) serialize_bool(f, "bus-name-good", s->bus_name_good); + (void) serialize_bool(f, "bus-name-owner", s->bus_name_owner); + + (void) serialize_item_format(f, "n-restarts", "%u", s->n_restarts); + (void) serialize_bool(f, "flush-n-restarts", s->flush_n_restarts); + + r = serialize_item_escaped(f, "status-text", s->status_text); + if (r < 0) + return r; + + service_serialize_exec_command(u, f, s->control_command); + service_serialize_exec_command(u, f, s->main_command); + + r = serialize_fd(f, fds, "stdin-fd", s->stdin_fd); + if (r < 0) + return r; + r = serialize_fd(f, fds, "stdout-fd", s->stdout_fd); + if (r < 0) + return r; + r = serialize_fd(f, fds, "stderr-fd", s->stderr_fd); + if (r < 0) + return r; + + if (s->exec_fd_event_source) { + r = serialize_fd(f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source)); + if (r < 0) + return r; + + (void) serialize_bool(f, "exec-fd-hot", s->exec_fd_hot); + } + + if (UNIT_ISSET(s->accept_socket)) { + r = serialize_item(f, "accept-socket", UNIT_DEREF(s->accept_socket)->id); + if (r < 0) + return r; + } + + r = serialize_fd(f, fds, "socket-fd", s->socket_fd); + if (r < 0) + return r; + + LIST_FOREACH(fd_store, fs, s->fd_store) { + _cleanup_free_ char *c = NULL; + int copy; + + copy = fdset_put_dup(fds, fs->fd); + if (copy < 0) + return log_error_errno(copy, "Failed to copy file descriptor for serialization: %m"); + + c = cescape(fs->fdname); + if (!c) + return log_oom(); + + (void) serialize_item_format(f, "fd-store-fd", "%i %s", copy, c); + } + + if (s->main_exec_status.pid > 0) { + (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid); + (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp); + (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp); + + if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) { + (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code); + (void) serialize_item_format(f, "main-exec-status-status", "%i", s->main_exec_status.status); + } + } + + (void) serialize_dual_timestamp(f, "watchdog-timestamp", &s->watchdog_timestamp); + (void) serialize_bool(f, "forbid-restart", s->forbid_restart); + + if (s->watchdog_override_enable) + (void) serialize_item_format(f, "watchdog-override-usec", USEC_FMT, s->watchdog_override_usec); + + if (s->watchdog_original_usec != USEC_INFINITY) + (void) serialize_item_format(f, "watchdog-original-usec", USEC_FMT, s->watchdog_original_usec); + + return 0; +} + +static int service_deserialize_exec_command(Unit *u, const char *key, const char *value) { + Service *s = SERVICE(u); + int r; + unsigned idx = 0, i; + bool control, found = false; + ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID; + ExecCommand *command = NULL; + _cleanup_free_ char *path = NULL; + _cleanup_strv_free_ char **argv = NULL; + + enum ExecCommandState { + STATE_EXEC_COMMAND_TYPE, + STATE_EXEC_COMMAND_INDEX, + STATE_EXEC_COMMAND_PATH, + STATE_EXEC_COMMAND_ARGS, + _STATE_EXEC_COMMAND_MAX, + _STATE_EXEC_COMMAND_INVALID = -1, + } state; + + assert(s); + assert(key); + assert(value); + + control = streq(key, "control-command"); + + state = STATE_EXEC_COMMAND_TYPE; + + for (;;) { + _cleanup_free_ char *arg = NULL; + + r = extract_first_word(&value, &arg, NULL, EXTRACT_CUNESCAPE); + if (r < 0) + return r; + if (r == 0) + break; + + switch (state) { + case STATE_EXEC_COMMAND_TYPE: + id = service_exec_command_from_string(arg); + if (id < 0) + return -EINVAL; + + state = STATE_EXEC_COMMAND_INDEX; + break; + case STATE_EXEC_COMMAND_INDEX: + r = safe_atou(arg, &idx); + if (r < 0) + return -EINVAL; + + state = STATE_EXEC_COMMAND_PATH; + break; + case STATE_EXEC_COMMAND_PATH: + path = TAKE_PTR(arg); + state = STATE_EXEC_COMMAND_ARGS; + + if (!path_is_absolute(path)) + return -EINVAL; + break; + case STATE_EXEC_COMMAND_ARGS: + r = strv_extend(&argv, arg); + if (r < 0) + return -ENOMEM; + break; + default: + assert_not_reached("Unknown error at deserialization of exec command"); + break; + } + } + + if (state != STATE_EXEC_COMMAND_ARGS) + return -EINVAL; + + /* Let's check whether exec command on given offset matches data that we just deserialized */ + for (command = s->exec_command[id], i = 0; command; command = command->command_next, i++) { + if (i != idx) + continue; + + found = strv_equal(argv, command->argv) && streq(command->path, path); + break; + } + + if (!found) { + /* Command at the index we serialized is different, let's look for command that exactly + * matches but is on different index. If there is no such command we will not resume execution. */ + for (command = s->exec_command[id]; command; command = command->command_next) + if (strv_equal(command->argv, argv) && streq(command->path, path)) + break; + } + + if (command && control) + s->control_command = command; + else if (command) + s->main_command = command; + else + log_unit_warning(u, "Current command vanished from the unit file, execution of the command list won't be resumed."); + + return 0; +} + +static int service_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Service *s = SERVICE(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + ServiceState state; + + state = service_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + } else if (streq(key, "result")) { + ServiceResult f; + + f = service_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != SERVICE_SUCCESS) + s->result = f; + + } else if (streq(key, "reload-result")) { + ServiceResult f; + + f = service_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse reload result value: %s", value); + else if (f != SERVICE_SUCCESS) + s->reload_result = f; + + } else if (streq(key, "control-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse control-pid value: %s", value); + else + s->control_pid = pid; + } else if (streq(key, "main-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse main-pid value: %s", value); + else + (void) service_set_main_pid(s, pid); + } else if (streq(key, "main-pid-known")) { + int b; + + b = parse_boolean(value); + if (b < 0) + log_unit_debug(u, "Failed to parse main-pid-known value: %s", value); + else + s->main_pid_known = b; + } else if (streq(key, "bus-name-good")) { + int b; + + b = parse_boolean(value); + if (b < 0) + log_unit_debug(u, "Failed to parse bus-name-good value: %s", value); + else + s->bus_name_good = b; + } else if (streq(key, "bus-name-owner")) { + r = free_and_strdup(&s->bus_name_owner, value); + if (r < 0) + log_unit_error_errno(u, r, "Unable to deserialize current bus owner %s: %m", value); + } else if (streq(key, "status-text")) { + char *t; + + r = cunescape(value, 0, &t); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to unescape status text '%s': %m", value); + else + free_and_replace(s->status_text, t); + + } else if (streq(key, "accept-socket")) { + Unit *socket; + + r = manager_load_unit(u->manager, value, NULL, NULL, &socket); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to load accept-socket unit '%s': %m", value); + else { + unit_ref_set(&s->accept_socket, u, socket); + SOCKET(socket)->n_connections++; + } + + } else if (streq(key, "socket-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse socket-fd value: %s", value); + else { + asynchronous_close(s->socket_fd); + s->socket_fd = fdset_remove(fds, fd); + } + } else if (streq(key, "fd-store-fd")) { + const char *fdv; + size_t pf; + int fd; + + pf = strcspn(value, WHITESPACE); + fdv = strndupa(value, pf); + + if (safe_atoi(fdv, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse fd-store-fd value: %s", value); + else { + _cleanup_free_ char *t = NULL; + const char *fdn; + + fdn = value + pf; + fdn += strspn(fdn, WHITESPACE); + (void) cunescape(fdn, 0, &t); + + r = service_add_fd_store(s, fd, t); + if (r < 0) + log_unit_error_errno(u, r, "Failed to add fd to store: %m"); + else + fdset_remove(fds, fd); + } + + } else if (streq(key, "main-exec-status-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse main-exec-status-pid value: %s", value); + else + s->main_exec_status.pid = pid; + } else if (streq(key, "main-exec-status-code")) { + int i; + + if (safe_atoi(value, &i) < 0) + log_unit_debug(u, "Failed to parse main-exec-status-code value: %s", value); + else + s->main_exec_status.code = i; + } else if (streq(key, "main-exec-status-status")) { + int i; + + if (safe_atoi(value, &i) < 0) + log_unit_debug(u, "Failed to parse main-exec-status-status value: %s", value); + else + s->main_exec_status.status = i; + } else if (streq(key, "main-exec-status-start")) + deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp); + else if (streq(key, "main-exec-status-exit")) + deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp); + else if (streq(key, "watchdog-timestamp")) + deserialize_dual_timestamp(value, &s->watchdog_timestamp); + else if (streq(key, "forbid-restart")) { + int b; + + b = parse_boolean(value); + if (b < 0) + log_unit_debug(u, "Failed to parse forbid-restart value: %s", value); + else + s->forbid_restart = b; + } else if (streq(key, "stdin-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse stdin-fd value: %s", value); + else { + asynchronous_close(s->stdin_fd); + s->stdin_fd = fdset_remove(fds, fd); + s->exec_context.stdio_as_fds = true; + } + } else if (streq(key, "stdout-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse stdout-fd value: %s", value); + else { + asynchronous_close(s->stdout_fd); + s->stdout_fd = fdset_remove(fds, fd); + s->exec_context.stdio_as_fds = true; + } + } else if (streq(key, "stderr-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse stderr-fd value: %s", value); + else { + asynchronous_close(s->stderr_fd); + s->stderr_fd = fdset_remove(fds, fd); + s->exec_context.stdio_as_fds = true; + } + } else if (streq(key, "exec-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse exec-fd value: %s", value); + else { + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + fd = fdset_remove(fds, fd); + if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) < 0) + safe_close(fd); + } + } else if (streq(key, "watchdog-override-usec")) { + if (deserialize_usec(value, &s->watchdog_override_usec) < 0) + log_unit_debug(u, "Failed to parse watchdog_override_usec value: %s", value); + else + s->watchdog_override_enable = true; + + } else if (streq(key, "watchdog-original-usec")) { + if (deserialize_usec(value, &s->watchdog_original_usec) < 0) + log_unit_debug(u, "Failed to parse watchdog_original_usec value: %s", value); + + } else if (STR_IN_SET(key, "main-command", "control-command")) { + r = service_deserialize_exec_command(u, key, value); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized command \"%s\": %m", value); + + } else if (streq(key, "n-restarts")) { + r = safe_atou(value, &s->n_restarts); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized restart counter '%s': %m", value); + + } else if (streq(key, "flush-n-restarts")) { + r = parse_boolean(value); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized flush restart counter setting '%s': %m", value); + else + s->flush_n_restarts = r; + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +_pure_ static UnitActiveState service_active_state(Unit *u) { + const UnitActiveState *table; + + assert(u); + + table = SERVICE(u)->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table; + + return table[SERVICE(u)->state]; +} + +static const char *service_sub_state_to_string(Unit *u) { + assert(u); + + return service_state_to_string(SERVICE(u)->state); +} + +static bool service_may_gc(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + /* Never clean up services that still have a process around, even if the service is formally dead. Note that + * unit_may_gc() already checked our cgroup for us, we just check our two additional PIDs, too, in case they + * have moved outside of the cgroup. */ + + if (main_pid_good(s) > 0 || + control_pid_good(s) > 0) + return false; + + return true; +} + +static int service_retry_pid_file(Service *s) { + int r; + + assert(s->pid_file); + assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST)); + + r = service_load_pid_file(s, false); + if (r < 0) + return r; + + service_unwatch_pid_file(s); + + service_enter_running(s, SERVICE_SUCCESS); + return 0; +} + +static int service_watch_pid_file(Service *s) { + int r; + + log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path); + + r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io); + if (r < 0) + goto fail; + + /* the pidfile might have appeared just before we set the watch */ + log_unit_debug(UNIT(s), "Trying to read PID file %s in case it changed", s->pid_file_pathspec->path); + service_retry_pid_file(s); + + return 0; +fail: + log_unit_error_errno(UNIT(s), r, "Failed to set a watch for PID file %s: %m", s->pid_file_pathspec->path); + service_unwatch_pid_file(s); + return r; +} + +static int service_demand_pid_file(Service *s) { + PathSpec *ps; + + assert(s->pid_file); + assert(!s->pid_file_pathspec); + + ps = new0(PathSpec, 1); + if (!ps) + return -ENOMEM; + + ps->unit = UNIT(s); + ps->path = strdup(s->pid_file); + if (!ps->path) { + free(ps); + return -ENOMEM; + } + + path_simplify(ps->path, false); + + /* PATH_CHANGED would not be enough. There are daemons (sendmail) that + * keep their PID file open all the time. */ + ps->type = PATH_MODIFIED; + ps->inotify_fd = -1; + + s->pid_file_pathspec = ps; + + return service_watch_pid_file(s); +} + +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { + PathSpec *p = userdata; + Service *s; + + assert(p); + + s = SERVICE(p->unit); + + assert(s); + assert(fd >= 0); + assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST)); + assert(s->pid_file_pathspec); + assert(path_spec_owns_inotify_fd(s->pid_file_pathspec, fd)); + + log_unit_debug(UNIT(s), "inotify event"); + + if (path_spec_fd_event(p, events) < 0) + goto fail; + + if (service_retry_pid_file(s) == 0) + return 0; + + if (service_watch_pid_file(s) < 0) + goto fail; + + return 0; + +fail: + service_unwatch_pid_file(s); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); + return 0; +} + +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { + Service *s = SERVICE(userdata); + + assert(s); + + log_unit_debug(UNIT(s), "got exec-fd event"); + + /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve() + * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically + * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the + * parent. We need to be careful however, as there are other reasons that we might cause the child's side of + * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless + * the child signalled us first that it is about to call the execve(). It does so by sending us a simple + * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it + * sends a zero byte we'll ignore POLLHUP on the fd again. */ + + for (;;) { + uint8_t x; + ssize_t n; + + n = read(fd, &x, sizeof(x)); + if (n < 0) { + if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ + return 0; + + return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m"); + } + if (n == 0) { /* EOF → the event we are waiting for */ + + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */ + log_unit_debug(UNIT(s), "Got EOF on exec-fd"); + + s->exec_fd_hot = false; + + /* Nice! This is what we have been waiting for. Transition to next state. */ + if (s->type == SERVICE_EXEC && s->state == SERVICE_START) + service_enter_start_post(s); + } else + log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring."); + + return 0; + } + + /* A byte was read → this turns on/off the exec fd logic */ + assert(n == sizeof(x)); + s->exec_fd_hot = x; + } + + return 0; +} + +static void service_notify_cgroup_empty_event(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + + log_unit_debug(u, "cgroup is empty"); + + switch (s->state) { + + /* Waiting for SIGCHLD is usually more interesting, + * because it includes return codes/signals. Which is + * why we ignore the cgroup events for most cases, + * except when we don't know pid which to expect the + * SIGCHLD for. */ + + case SERVICE_START: + if (s->type == SERVICE_NOTIFY && + main_pid_good(s) == 0 && + control_pid_good(s) == 0) { + /* No chance of getting a ready notification anymore */ + service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL); + break; + } + + _fallthrough_; + case SERVICE_START_POST: + if (s->pid_file_pathspec && + main_pid_good(s) == 0 && + control_pid_good(s) == 0) { + + /* Give up hoping for the daemon to write its PID file */ + log_unit_warning(u, "Daemon never wrote its PID file. Failing."); + + service_unwatch_pid_file(s); + if (s->state == SERVICE_START) + service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL); + else + service_enter_stop(s, SERVICE_FAILURE_PROTOCOL); + } + break; + + case SERVICE_RUNNING: + /* service_enter_running() will figure out what to do */ + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + + if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0) + service_enter_stop_post(s, SERVICE_SUCCESS); + + break; + + case SERVICE_STOP_POST: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0) + service_enter_dead(s, SERVICE_SUCCESS, true); + + break; + + default: + ; + } +} + +static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { + bool notify_dbus = true; + Service *s = SERVICE(u); + ServiceResult f; + + assert(s); + assert(pid >= 0); + + if (is_clean_exit(code, status, s->type == SERVICE_ONESHOT ? EXIT_CLEAN_COMMAND : EXIT_CLEAN_DAEMON, &s->success_status)) + f = SERVICE_SUCCESS; + else if (code == CLD_EXITED) + f = SERVICE_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = SERVICE_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = SERVICE_FAILURE_CORE_DUMP; + else + assert_not_reached("Unknown code"); + + if (s->main_pid == pid) { + /* Forking services may occasionally move to a new PID. + * As long as they update the PID file before exiting the old + * PID, they're fine. */ + if (service_load_pid_file(s, false) > 0) + return; + + s->main_pid = 0; + exec_status_exit(&s->main_exec_status, &s->exec_context, pid, code, status); + + if (s->main_command) { + /* If this is not a forking service than the + * main process got started and hence we copy + * the exit status so that it is recorded both + * as main and as control process exit + * status */ + + s->main_command->exec_status = s->main_exec_status; + + if (s->main_command->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SERVICE_SUCCESS; + } else if (s->exec_command[SERVICE_EXEC_START]) { + + /* If this is a forked process, then we should + * ignore the return value if this was + * configured for the starter process */ + + if (s->exec_command[SERVICE_EXEC_START]->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SERVICE_SUCCESS; + } + + /* When this is a successful exit, let's log about the exit code on DEBUG level. If this is a failure + * and the process exited on its own via exit(), then let's make this a NOTICE, under the assumption + * that the service already logged the reason at a higher log level on its own. (Internally, + * unit_log_process_exit() will possibly bump this to WARNING if the service died due to a signal.) */ + unit_log_process_exit( + u, f == SERVICE_SUCCESS ? LOG_DEBUG : LOG_NOTICE, + "Main process", + service_exec_command_to_string(SERVICE_EXEC_START), + code, status); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + if (s->main_command && + s->main_command->command_next && + s->type == SERVICE_ONESHOT && + f == SERVICE_SUCCESS) { + + /* There is another command to * + * execute, so let's do that. */ + + log_unit_debug(u, "Running next main command for state %s.", service_state_to_string(s->state)); + service_run_next_main(s); + + } else { + + /* The service exited, so the service is officially + * gone. */ + s->main_command = NULL; + + switch (s->state) { + + case SERVICE_START_POST: + case SERVICE_RELOAD: + case SERVICE_STOP: + /* Need to wait until the operation is + * done */ + break; + + case SERVICE_START: + if (s->type == SERVICE_ONESHOT) { + /* This was our main goal, so let's go on */ + if (f == SERVICE_SUCCESS) + service_enter_start_post(s); + else + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + } else if (s->type == SERVICE_NOTIFY) { + /* Only enter running through a notification, so that the + * SERVICE_START state signifies that no ready notification + * has been received */ + if (f != SERVICE_SUCCESS) + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + else if (!s->remain_after_exit || s->notify_access == NOTIFY_MAIN) + /* The service has never been and will never be active */ + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL); + break; + } + + _fallthrough_; + case SERVICE_RUNNING: + service_enter_running(s, f); + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + + if (control_pid_good(s) <= 0) + service_enter_stop_post(s, f); + + /* If there is still a control process, wait for that first */ + break; + + case SERVICE_STOP_POST: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + + if (control_pid_good(s) <= 0) + service_enter_dead(s, f, true); + break; + + default: + assert_not_reached("Uh, main process died at wrong time."); + } + } + + } else if (s->control_pid == pid) { + s->control_pid = 0; + + if (s->control_command) { + exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status); + + if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SERVICE_SUCCESS; + } + + unit_log_process_exit( + u, f == SERVICE_SUCCESS ? LOG_DEBUG : LOG_NOTICE, + "Control process", + service_exec_command_to_string(s->control_command_id), + code, status); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + if (s->control_command && + s->control_command->command_next && + f == SERVICE_SUCCESS) { + + /* There is another command to * + * execute, so let's do that. */ + + log_unit_debug(u, "Running next control command for state %s.", service_state_to_string(s->state)); + service_run_next_control(s); + + } else { + /* No further commands for this step, so let's + * figure out what to do next */ + + s->control_command = NULL; + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + + log_unit_debug(u, "Got final SIGCHLD for state %s.", service_state_to_string(s->state)); + + switch (s->state) { + + case SERVICE_START_PRE: + if (f == SERVICE_SUCCESS) + service_enter_start(s); + else + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + + case SERVICE_START: + if (s->type != SERVICE_FORKING) + /* Maybe spurious event due to a reload that changed the type? */ + break; + + if (f != SERVICE_SUCCESS) { + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + } + + if (s->pid_file) { + bool has_start_post; + int r; + + /* Let's try to load the pid file here if we can. + * The PID file might actually be created by a START_POST + * script. In that case don't worry if the loading fails. */ + + has_start_post = s->exec_command[SERVICE_EXEC_START_POST]; + r = service_load_pid_file(s, !has_start_post); + if (!has_start_post && r < 0) { + r = service_demand_pid_file(s); + if (r < 0 || cgroup_good(s) == 0) + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL); + break; + } + } else + service_search_main_pid(s); + + service_enter_start_post(s); + break; + + case SERVICE_START_POST: + if (f != SERVICE_SUCCESS) { + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + } + + if (s->pid_file) { + int r; + + r = service_load_pid_file(s, true); + if (r < 0) { + r = service_demand_pid_file(s); + if (r < 0 || cgroup_good(s) == 0) + service_enter_stop(s, SERVICE_FAILURE_PROTOCOL); + break; + } + } else + service_search_main_pid(s); + + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_RELOAD: + if (f == SERVICE_SUCCESS) + if (service_load_pid_file(s, true) < 0) + service_search_main_pid(s); + + s->reload_result = f; + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP: + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + if (main_pid_good(s) <= 0) + service_enter_stop_post(s, f); + + /* If there is still a service + * process around, wait until + * that one quit, too */ + break; + + case SERVICE_STOP_POST: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + if (main_pid_good(s) <= 0) + service_enter_dead(s, f, true); + break; + + default: + assert_not_reached("Uh, control process died at wrong time."); + } + } + } else /* Neither control nor main PID? If so, don't notify about anything */ + notify_dbus = false; + + /* Notify clients about changed exit status */ + if (notify_dbus) + unit_add_to_dbus_queue(u); + + /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to watch, + * under the assumption that we'll sooner or later get a SIGCHLD for them, as the original process we watched + * was probably the parent of them, and they are hence now our children. */ + (void) unit_enqueue_rewatch_pids(u); +} + +static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Service *s = SERVICE(userdata); + + assert(s); + assert(source == s->timer_event_source); + + switch (s->state) { + + case SERVICE_START_PRE: + case SERVICE_START: + log_unit_warning(UNIT(s), "%s operation timed out. Terminating.", s->state == SERVICE_START ? "Start" : "Start-pre"); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_START_POST: + log_unit_warning(UNIT(s), "Start-post operation timed out. Stopping."); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_RUNNING: + log_unit_warning(UNIT(s), "Service reached runtime time limit. Stopping."); + service_enter_stop(s, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_RELOAD: + log_unit_warning(UNIT(s), "Reload operation timed out. Killing reload process."); + service_kill_control_process(s); + s->reload_result = SERVICE_FAILURE_TIMEOUT; + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP: + log_unit_warning(UNIT(s), "Stopping timed out. Terminating."); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_STOP_WATCHDOG: + log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Terminating."); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_STOP_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Killing."); + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Skipping SIGKILL."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + } + + break; + + case SERVICE_STOP_SIGKILL: + /* Uh, we sent a SIGKILL and it is still not gone? + * Must be something we cannot kill, so let's just be + * weirded out and continue */ + + log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_STOP_POST: + log_unit_warning(UNIT(s), "State 'stop-post' timed out. Terminating."); + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_FINAL_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'stop-final-sigterm' timed out. Killing."); + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'stop-final-sigterm' timed out. Skipping SIGKILL. Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false); + } + + break; + + case SERVICE_FINAL_SIGKILL: + log_unit_warning(UNIT(s), "Processes still around after final SIGKILL. Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, true); + break; + + case SERVICE_AUTO_RESTART: + if (s->restart_usec > 0) { + char buf_restart[FORMAT_TIMESPAN_MAX]; + log_unit_info(UNIT(s), + "Service RestartSec=%s expired, scheduling restart.", + format_timespan(buf_restart, sizeof buf_restart, s->restart_usec, USEC_PER_SEC)); + } else + log_unit_info(UNIT(s), + "Service has no hold-off time (RestartSec=0), scheduling restart."); + + service_enter_restart(s); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata) { + Service *s = SERVICE(userdata); + char t[FORMAT_TIMESPAN_MAX]; + usec_t watchdog_usec; + + assert(s); + assert(source == s->watchdog_event_source); + + watchdog_usec = service_get_watchdog_usec(s); + + if (UNIT(s)->manager->service_watchdogs) { + log_unit_error(UNIT(s), "Watchdog timeout (limit %s)!", + format_timespan(t, sizeof(t), watchdog_usec, 1)); + + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG); + } else + log_unit_warning(UNIT(s), "Watchdog disabled! Ignoring watchdog timeout (limit %s)!", + format_timespan(t, sizeof(t), watchdog_usec, 1)); + + return 0; +} + +static bool service_notify_message_authorized(Service *s, pid_t pid, char **tags, FDSet *fds) { + assert(s); + + if (s->notify_access == NOTIFY_NONE) { + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled.", pid); + return false; + } + + if (s->notify_access == NOTIFY_MAIN && pid != s->main_pid) { + if (s->main_pid != 0) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid); + else + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid); + + return false; + } + + if (s->notify_access == NOTIFY_EXEC && pid != s->main_pid && pid != s->control_pid) { + if (s->main_pid != 0 && s->control_pid != 0) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT, + pid, s->main_pid, s->control_pid); + else if (s->main_pid != 0) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid); + else if (s->control_pid != 0) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid); + else + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid); + + return false; + } + + return true; +} + +static void service_notify_message( + Unit *u, + const struct ucred *ucred, + char **tags, + FDSet *fds) { + + Service *s = SERVICE(u); + bool notify_dbus = false; + const char *e; + char **i; + int r; + + assert(u); + assert(ucred); + + if (!service_notify_message_authorized(SERVICE(u), ucred->pid, tags, fds)) + return; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *cc = NULL; + + cc = strv_join(tags, ", "); + log_unit_debug(u, "Got notification message from PID "PID_FMT" (%s)", ucred->pid, isempty(cc) ? "n/a" : cc); + } + + /* Interpret MAINPID= */ + e = strv_find_startswith(tags, "MAINPID="); + if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) { + pid_t new_main_pid; + + if (parse_pid(e, &new_main_pid) < 0) + log_unit_warning(u, "Failed to parse MAINPID= field in notification message, ignoring: %s", e); + else if (!s->main_pid_known || new_main_pid != s->main_pid) { + + r = service_is_suitable_main_pid(s, new_main_pid, LOG_WARNING); + if (r == 0) { + /* The new main PID is a bit suspicous, which is OK if the sender is privileged. */ + + if (ucred->uid == 0) { + log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, but we'll accept it as the request to change it came from a privileged process.", new_main_pid); + r = 1; + } else + log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid); + } + if (r > 0) { + service_set_main_pid(s, new_main_pid); + + r = unit_watch_pid(UNIT(s), new_main_pid); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to watch new main PID "PID_FMT" for service: %m", new_main_pid); + + notify_dbus = true; + } + } + } + + /* Interpret READY=/STOPPING=/RELOADING=. Last one wins. */ + STRV_FOREACH_BACKWARDS(i, tags) { + + if (streq(*i, "READY=1")) { + s->notify_state = NOTIFY_READY; + + /* Type=notify services inform us about completed + * initialization with READY=1 */ + if (s->type == SERVICE_NOTIFY && s->state == SERVICE_START) + service_enter_start_post(s); + + /* Sending READY=1 while we are reloading informs us + * that the reloading is complete */ + if (s->state == SERVICE_RELOAD && s->control_pid == 0) + service_enter_running(s, SERVICE_SUCCESS); + + notify_dbus = true; + break; + + } else if (streq(*i, "RELOADING=1")) { + s->notify_state = NOTIFY_RELOADING; + + if (s->state == SERVICE_RUNNING) + service_enter_reload_by_notify(s); + + notify_dbus = true; + break; + + } else if (streq(*i, "STOPPING=1")) { + s->notify_state = NOTIFY_STOPPING; + + if (s->state == SERVICE_RUNNING) + service_enter_stop_by_notify(s); + + notify_dbus = true; + break; + } + } + + /* Interpret STATUS= */ + e = strv_find_startswith(tags, "STATUS="); + if (e) { + _cleanup_free_ char *t = NULL; + + if (!isempty(e)) { + /* Note that this size limit check is mostly paranoia: since the datagram size we are willing + * to process is already limited to NOTIFY_BUFFER_MAX, this limit here should never be hit. */ + if (strlen(e) > STATUS_TEXT_MAX) + log_unit_warning(u, "Status message overly long (%zu > %u), ignoring.", strlen(e), STATUS_TEXT_MAX); + else if (!utf8_is_valid(e)) + log_unit_warning(u, "Status message in notification message is not UTF-8 clean, ignoring."); + else { + t = strdup(e); + if (!t) + log_oom(); + } + } + + if (!streq_ptr(s->status_text, t)) { + free_and_replace(s->status_text, t); + notify_dbus = true; + } + } + + /* Interpret ERRNO= */ + e = strv_find_startswith(tags, "ERRNO="); + if (e) { + int status_errno; + + status_errno = parse_errno(e); + if (status_errno < 0) + log_unit_warning_errno(u, status_errno, + "Failed to parse ERRNO= field value '%s' in notification message: %m", e); + else if (s->status_errno != status_errno) { + s->status_errno = status_errno; + notify_dbus = true; + } + } + + /* Interpret EXTEND_TIMEOUT= */ + e = strv_find_startswith(tags, "EXTEND_TIMEOUT_USEC="); + if (e) { + usec_t extend_timeout_usec; + if (safe_atou64(e, &extend_timeout_usec) < 0) + log_unit_warning(u, "Failed to parse EXTEND_TIMEOUT_USEC=%s", e); + else + service_extend_timeout(s, extend_timeout_usec); + } + + /* Interpret WATCHDOG= */ + if (strv_find(tags, "WATCHDOG=1")) + service_reset_watchdog(s); + + e = strv_find_startswith(tags, "WATCHDOG_USEC="); + if (e) { + usec_t watchdog_override_usec; + if (safe_atou64(e, &watchdog_override_usec) < 0) + log_unit_warning(u, "Failed to parse WATCHDOG_USEC=%s", e); + else + service_override_watchdog_timeout(s, watchdog_override_usec); + } + + /* Process FD store messages. Either FDSTOREREMOVE=1 for removal, or FDSTORE=1 for addition. In both cases, + * process FDNAME= for picking the file descriptor name to use. Note that FDNAME= is required when removing + * fds, but optional when pushing in new fds, for compatibility reasons. */ + if (strv_find(tags, "FDSTOREREMOVE=1")) { + const char *name; + + name = strv_find_startswith(tags, "FDNAME="); + if (!name || !fdname_is_valid(name)) + log_unit_warning(u, "FDSTOREREMOVE=1 requested, but no valid file descriptor name passed, ignoring."); + else + service_remove_fd_store(s, name); + + } else if (strv_find(tags, "FDSTORE=1")) { + const char *name; + + name = strv_find_startswith(tags, "FDNAME="); + if (name && !fdname_is_valid(name)) { + log_unit_warning(u, "Passed FDNAME= name is invalid, ignoring."); + name = NULL; + } + + (void) service_add_fd_store_set(s, fds, name); + } + + /* Notify clients about changed status or main pid */ + if (notify_dbus) + unit_add_to_dbus_queue(u); +} + +static int service_get_timeout(Unit *u, usec_t *timeout) { + Service *s = SERVICE(u); + uint64_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static void service_bus_name_owner_change( + Unit *u, + const char *name, + const char *old_owner, + const char *new_owner) { + + Service *s = SERVICE(u); + int r; + + assert(s); + assert(name); + + assert(streq(s->bus_name, name)); + assert(old_owner || new_owner); + + if (old_owner && new_owner) + log_unit_debug(u, "D-Bus name %s changed owner from %s to %s", name, old_owner, new_owner); + else if (old_owner) + log_unit_debug(u, "D-Bus name %s no longer registered by %s", name, old_owner); + else + log_unit_debug(u, "D-Bus name %s now registered by %s", name, new_owner); + + s->bus_name_good = !!new_owner; + + /* Track the current owner, so we can reconstruct changes after a daemon reload */ + r = free_and_strdup(&s->bus_name_owner, new_owner); + if (r < 0) { + log_unit_error_errno(u, r, "Unable to set new bus name owner %s: %m", new_owner); + return; + } + + if (s->type == SERVICE_DBUS) { + + /* service_enter_running() will figure out what to + * do */ + if (s->state == SERVICE_RUNNING) + service_enter_running(s, SERVICE_SUCCESS); + else if (s->state == SERVICE_START && new_owner) + service_enter_start_post(s); + + } else if (new_owner && + s->main_pid <= 0 && + IN_SET(s->state, + SERVICE_START, + SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD)) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + /* Try to acquire PID from bus service */ + + r = sd_bus_get_name_creds(u->manager->api_bus, name, SD_BUS_CREDS_PID, &creds); + if (r >= 0) + r = sd_bus_creds_get_pid(creds, &pid); + if (r >= 0) { + log_unit_debug(u, "D-Bus name %s is now owned by process " PID_FMT, name, pid); + + service_set_main_pid(s, pid); + unit_watch_pid(UNIT(s), pid); + } + } +} + +int service_set_socket_fd(Service *s, int fd, Socket *sock, bool selinux_context_net) { + _cleanup_free_ char *peer = NULL; + int r; + + assert(s); + assert(fd >= 0); + + /* This is called by the socket code when instantiating a new service for a stream socket and the socket needs + * to be configured. We take ownership of the passed fd on success. */ + + if (UNIT(s)->load_state != UNIT_LOADED) + return -EINVAL; + + if (s->socket_fd >= 0) + return -EBUSY; + + if (s->state != SERVICE_DEAD) + return -EAGAIN; + + if (getpeername_pretty(fd, true, &peer) >= 0) { + + if (UNIT(s)->description) { + _cleanup_free_ char *a; + + a = strjoin(UNIT(s)->description, " (", peer, ")"); + if (!a) + return -ENOMEM; + + r = unit_set_description(UNIT(s), a); + } else + r = unit_set_description(UNIT(s), peer); + + if (r < 0) + return r; + } + + r = unit_add_two_dependencies(UNIT(sock), UNIT_BEFORE, UNIT_TRIGGERS, UNIT(s), false, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + + s->socket_fd = fd; + s->socket_fd_selinux_context_net = selinux_context_net; + + unit_ref_set(&s->accept_socket, UNIT(s), UNIT(sock)); + return 0; +} + +static void service_reset_failed(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + if (s->state == SERVICE_FAILED) + service_set_state(s, SERVICE_DEAD); + + s->result = SERVICE_SUCCESS; + s->reload_result = SERVICE_SUCCESS; + s->n_restarts = 0; + s->flush_n_restarts = false; +} + +static int service_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + Service *s = SERVICE(u); + + assert(s); + + return unit_kill_common(u, who, signo, s->main_pid, s->control_pid, error); +} + +static int service_main_pid(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + return s->main_pid; +} + +static int service_control_pid(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + return s->control_pid; +} + +static bool service_needs_console(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + /* We provide our own implementation of this here, instead of relying of the generic implementation + * unit_needs_console() provides, since we want to return false if we are in SERVICE_EXITED state. */ + + if (!exec_context_may_touch_console(&s->exec_context)) + return false; + + return IN_SET(s->state, + SERVICE_START_PRE, + SERVICE_START, + SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD, + SERVICE_STOP, + SERVICE_STOP_WATCHDOG, + SERVICE_STOP_SIGTERM, + SERVICE_STOP_SIGKILL, + SERVICE_STOP_POST, + SERVICE_FINAL_SIGTERM, + SERVICE_FINAL_SIGKILL); +} + +static int service_exit_status(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + + if (s->main_exec_status.pid <= 0 || + !dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) + return -ENODATA; + + if (s->main_exec_status.code != CLD_EXITED) + return -EBADE; + + return s->main_exec_status.status; +} + +static const char* const service_restart_table[_SERVICE_RESTART_MAX] = { + [SERVICE_RESTART_NO] = "no", + [SERVICE_RESTART_ON_SUCCESS] = "on-success", + [SERVICE_RESTART_ON_FAILURE] = "on-failure", + [SERVICE_RESTART_ON_ABNORMAL] = "on-abnormal", + [SERVICE_RESTART_ON_WATCHDOG] = "on-watchdog", + [SERVICE_RESTART_ON_ABORT] = "on-abort", + [SERVICE_RESTART_ALWAYS] = "always", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_restart, ServiceRestart); + +static const char* const service_type_table[_SERVICE_TYPE_MAX] = { + [SERVICE_SIMPLE] = "simple", + [SERVICE_FORKING] = "forking", + [SERVICE_ONESHOT] = "oneshot", + [SERVICE_DBUS] = "dbus", + [SERVICE_NOTIFY] = "notify", + [SERVICE_IDLE] = "idle", + [SERVICE_EXEC] = "exec", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType); + +static const char* const service_exec_command_table[_SERVICE_EXEC_COMMAND_MAX] = { + [SERVICE_EXEC_START_PRE] = "ExecStartPre", + [SERVICE_EXEC_START] = "ExecStart", + [SERVICE_EXEC_START_POST] = "ExecStartPost", + [SERVICE_EXEC_RELOAD] = "ExecReload", + [SERVICE_EXEC_STOP] = "ExecStop", + [SERVICE_EXEC_STOP_POST] = "ExecStopPost", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_exec_command, ServiceExecCommand); + +static const char* const notify_state_table[_NOTIFY_STATE_MAX] = { + [NOTIFY_UNKNOWN] = "unknown", + [NOTIFY_READY] = "ready", + [NOTIFY_RELOADING] = "reloading", + [NOTIFY_STOPPING] = "stopping", +}; + +DEFINE_STRING_TABLE_LOOKUP(notify_state, NotifyState); + +static const char* const service_result_table[_SERVICE_RESULT_MAX] = { + [SERVICE_SUCCESS] = "success", + [SERVICE_FAILURE_RESOURCES] = "resources", + [SERVICE_FAILURE_PROTOCOL] = "protocol", + [SERVICE_FAILURE_TIMEOUT] = "timeout", + [SERVICE_FAILURE_EXIT_CODE] = "exit-code", + [SERVICE_FAILURE_SIGNAL] = "signal", + [SERVICE_FAILURE_CORE_DUMP] = "core-dump", + [SERVICE_FAILURE_WATCHDOG] = "watchdog", + [SERVICE_FAILURE_START_LIMIT_HIT] = "start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult); + +const UnitVTable service_vtable = { + .object_size = sizeof(Service), + .exec_context_offset = offsetof(Service, exec_context), + .cgroup_context_offset = offsetof(Service, cgroup_context), + .kill_context_offset = offsetof(Service, kill_context), + .exec_runtime_offset = offsetof(Service, exec_runtime), + .dynamic_creds_offset = offsetof(Service, dynamic_creds), + + .sections = + "Unit\0" + "Service\0" + "Install\0", + .private_section = "Service", + + .can_transient = true, + .can_delegate = true, + + .init = service_init, + .done = service_done, + .load = service_load, + .release_resources = service_release_resources, + + .coldplug = service_coldplug, + + .dump = service_dump, + + .start = service_start, + .stop = service_stop, + .reload = service_reload, + + .can_reload = service_can_reload, + + .kill = service_kill, + + .serialize = service_serialize, + .deserialize_item = service_deserialize_item, + + .active_state = service_active_state, + .sub_state_to_string = service_sub_state_to_string, + + .will_restart = service_will_restart, + + .may_gc = service_may_gc, + + .sigchld_event = service_sigchld_event, + + .reset_failed = service_reset_failed, + + .notify_cgroup_empty = service_notify_cgroup_empty_event, + .notify_message = service_notify_message, + + .main_pid = service_main_pid, + .control_pid = service_control_pid, + + .bus_name_owner_change = service_bus_name_owner_change, + + .bus_vtable = bus_service_vtable, + .bus_set_property = bus_service_set_property, + .bus_commit_properties = bus_service_commit_properties, + + .get_timeout = service_get_timeout, + .needs_console = service_needs_console, + .exit_status = service_exit_status, + + .status_message_formats = { + .starting_stopping = { + [0] = "Starting %s...", + [1] = "Stopping %s...", + }, + .finished_start_job = { + [JOB_DONE] = "Started %s.", + [JOB_FAILED] = "Failed to start %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Stopped %s.", + [JOB_FAILED] = "Stopped (with error) %s.", + }, + }, +}; diff --git a/src/core/service.h b/src/core/service.h new file mode 100644 index 0000000..9c4340c --- /dev/null +++ b/src/core/service.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct Service Service; +typedef struct ServiceFDStore ServiceFDStore; + +#include "exit-status.h" +#include "kill.h" +#include "path.h" +#include "ratelimit.h" +#include "socket.h" +#include "unit.h" + +typedef enum ServiceRestart { + SERVICE_RESTART_NO, + SERVICE_RESTART_ON_SUCCESS, + SERVICE_RESTART_ON_FAILURE, + SERVICE_RESTART_ON_ABNORMAL, + SERVICE_RESTART_ON_WATCHDOG, + SERVICE_RESTART_ON_ABORT, + SERVICE_RESTART_ALWAYS, + _SERVICE_RESTART_MAX, + _SERVICE_RESTART_INVALID = -1 +} ServiceRestart; + +typedef enum ServiceType { + SERVICE_SIMPLE, /* we fork and go on right-away (i.e. modern socket activated daemons) */ + SERVICE_FORKING, /* forks by itself (i.e. traditional daemons) */ + SERVICE_ONESHOT, /* we fork and wait until the program finishes (i.e. programs like fsck which run and need to finish before we continue) */ + SERVICE_DBUS, /* we fork and wait until a specific D-Bus name appears on the bus */ + SERVICE_NOTIFY, /* we fork and wait until a daemon sends us a ready message with sd_notify() */ + SERVICE_IDLE, /* much like simple, but delay exec() until all jobs are dispatched. */ + SERVICE_EXEC, /* we fork and wait until we execute exec() (this means our own setup is waited for) */ + _SERVICE_TYPE_MAX, + _SERVICE_TYPE_INVALID = -1 +} ServiceType; + +typedef enum ServiceExecCommand { + SERVICE_EXEC_START_PRE, + SERVICE_EXEC_START, + SERVICE_EXEC_START_POST, + SERVICE_EXEC_RELOAD, + SERVICE_EXEC_STOP, + SERVICE_EXEC_STOP_POST, + _SERVICE_EXEC_COMMAND_MAX, + _SERVICE_EXEC_COMMAND_INVALID = -1 +} ServiceExecCommand; + +typedef enum NotifyState { + NOTIFY_UNKNOWN, + NOTIFY_READY, + NOTIFY_RELOADING, + NOTIFY_STOPPING, + _NOTIFY_STATE_MAX, + _NOTIFY_STATE_INVALID = -1 +} NotifyState; + +/* The values of this enum are referenced in man/systemd.exec.xml and src/shared/bus-unit-util.c. + * Update those sources for each change to this enum. */ +typedef enum ServiceResult { + SERVICE_SUCCESS, + SERVICE_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */ + SERVICE_FAILURE_PROTOCOL, + SERVICE_FAILURE_TIMEOUT, + SERVICE_FAILURE_EXIT_CODE, + SERVICE_FAILURE_SIGNAL, + SERVICE_FAILURE_CORE_DUMP, + SERVICE_FAILURE_WATCHDOG, + SERVICE_FAILURE_START_LIMIT_HIT, + _SERVICE_RESULT_MAX, + _SERVICE_RESULT_INVALID = -1 +} ServiceResult; + +struct ServiceFDStore { + Service *service; + + int fd; + char *fdname; + sd_event_source *event_source; + + LIST_FIELDS(ServiceFDStore, fd_store); +}; + +struct Service { + Unit meta; + + ServiceType type; + ServiceRestart restart; + ExitStatusSet restart_prevent_status; + ExitStatusSet restart_force_status; + ExitStatusSet success_status; + + /* If set we'll read the main daemon PID from this file */ + char *pid_file; + + usec_t restart_usec; + usec_t timeout_start_usec; + usec_t timeout_stop_usec; + usec_t runtime_max_usec; + + dual_timestamp watchdog_timestamp; + usec_t watchdog_usec; /* the requested watchdog timeout in the unit file */ + usec_t watchdog_original_usec; /* the watchdog timeout that was in effect when the unit was started, i.e. the timeout the forked off processes currently see */ + usec_t watchdog_override_usec; /* the watchdog timeout requested by the service itself through sd_notify() */ + bool watchdog_override_enable; + sd_event_source *watchdog_event_source; + + ExecCommand* exec_command[_SERVICE_EXEC_COMMAND_MAX]; + + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ServiceState state, deserialized_state; + + /* The exit status of the real main process */ + ExecStatus main_exec_status; + + /* The currently executed control process */ + ExecCommand *control_command; + + /* The currently executed main process, which may be NULL if + * the main process got started via forking mode and not by + * us */ + ExecCommand *main_command; + + /* The ID of the control command currently being executed */ + ServiceExecCommand control_command_id; + + /* Runtime data of the execution context */ + ExecRuntime *exec_runtime; + DynamicCreds dynamic_creds; + + pid_t main_pid, control_pid; + int socket_fd; + SocketPeer *peer; + bool socket_fd_selinux_context_net; + + bool permissions_start_only; + bool root_directory_start_only; + bool remain_after_exit; + bool guess_main_pid; + + /* If we shut down, remember why */ + ServiceResult result; + ServiceResult reload_result; + + bool main_pid_known:1; + bool main_pid_alien:1; + bool bus_name_good:1; + bool forbid_restart:1; + /* Keep restart intention between UNIT_FAILED and UNIT_ACTIVATING */ + bool will_auto_restart:1; + bool start_timeout_defined:1; + + char *bus_name; + char *bus_name_owner; /* unique name of the current owner */ + + char *status_text; + int status_errno; + + UnitRef accept_socket; + + sd_event_source *timer_event_source; + PathSpec *pid_file_pathspec; + + NotifyAccess notify_access; + NotifyState notify_state; + + sd_event_source *exec_fd_event_source; + + ServiceFDStore *fd_store; + size_t n_fd_store; + unsigned n_fd_store_max; + unsigned n_keep_fd_store; + + char *usb_function_descriptors; + char *usb_function_strings; + + int stdin_fd; + int stdout_fd; + int stderr_fd; + + unsigned n_restarts; + bool flush_n_restarts; + bool exec_fd_hot; +}; + +extern const UnitVTable service_vtable; + +int service_set_socket_fd(Service *s, int fd, struct Socket *socket, bool selinux_context_net); +void service_close_socket_fd(Service *s); + +const char* service_restart_to_string(ServiceRestart i) _const_; +ServiceRestart service_restart_from_string(const char *s) _pure_; + +const char* service_type_to_string(ServiceType i) _const_; +ServiceType service_type_from_string(const char *s) _pure_; + +const char* service_exec_command_to_string(ServiceExecCommand i) _const_; +ServiceExecCommand service_exec_command_from_string(const char *s) _pure_; + +const char* notify_state_to_string(NotifyState i) _const_; +NotifyState notify_state_from_string(const char *s) _pure_; + +const char* service_result_to_string(ServiceResult i) _const_; +ServiceResult service_result_from_string(const char *s) _pure_; + +DEFINE_CAST(SERVICE, Service); + +#define STATUS_TEXT_MAX (16U*1024U) diff --git a/src/core/show-status.c b/src/core/show-status.c new file mode 100644 index 0000000..f748a82 --- /dev/null +++ b/src/core/show-status.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "parse-util.h" +#include "show-status.h" +#include "string-table.h" +#include "string-util.h" +#include "terminal-util.h" +#include "util.h" + +static const char* const show_status_table[_SHOW_STATUS_MAX] = { + [SHOW_STATUS_NO] = "no", + [SHOW_STATUS_AUTO] = "auto", + [SHOW_STATUS_TEMPORARY] = "temporary", + [SHOW_STATUS_YES] = "yes", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(show_status, ShowStatus, SHOW_STATUS_YES); + +int parse_show_status(const char *v, ShowStatus *ret) { + ShowStatus s; + + assert(ret); + + s = show_status_from_string(v); + if (s < 0 || s == SHOW_STATUS_TEMPORARY) + return -EINVAL; + + *ret = s; + return 0; +} + +int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) { + static const char status_indent[] = " "; /* "[" STATUS "] " */ + _cleanup_free_ char *s = NULL; + _cleanup_close_ int fd = -1; + struct iovec iovec[7] = {}; + int n = 0; + static bool prev_ephemeral; + + assert(format); + + /* This is independent of logging, as status messages are + * optional and go exclusively to the console. */ + + if (vasprintf(&s, format, ap) < 0) + return log_oom(); + + /* Before you ask: yes, on purpose we open/close the console for each status line we write individually. This + * is a good strategy to avoid PID 1 getting killed by the kernel's SAK concept (it doesn't fix this entirely, + * but minimizes the time window the kernel might end up killing PID 1 due to SAK). It also makes things easier + * for us so that we don't have to recover from hangups and suchlike triggered on the console. */ + + fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return fd; + + if (FLAGS_SET(flags, SHOW_STATUS_ELLIPSIZE)) { + char *e; + size_t emax, sl; + int c; + + c = fd_columns(fd); + if (c <= 0) + c = 80; + + sl = status ? sizeof(status_indent)-1 : 0; + + emax = c - sl - 1; + if (emax < 3) + emax = 3; + + e = ellipsize(s, emax, 50); + if (e) + free_and_replace(s, e); + } + + if (prev_ephemeral) + iovec[n++] = IOVEC_MAKE_STRING(ANSI_REVERSE_LINEFEED "\r" ANSI_ERASE_TO_END_OF_LINE); + + if (status) { + if (!isempty(status)) { + iovec[n++] = IOVEC_MAKE_STRING("["); + iovec[n++] = IOVEC_MAKE_STRING(status); + iovec[n++] = IOVEC_MAKE_STRING("] "); + } else + iovec[n++] = IOVEC_MAKE_STRING(status_indent); + } + + iovec[n++] = IOVEC_MAKE_STRING(s); + iovec[n++] = IOVEC_MAKE_STRING("\n"); + + if (prev_ephemeral && !FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL)) + iovec[n++] = IOVEC_MAKE_STRING(ANSI_ERASE_TO_END_OF_LINE); + prev_ephemeral = FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL) ; + + if (writev(fd, iovec, n) < 0) + return -errno; + + return 0; +} + +int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) { + va_list ap; + int r; + + assert(format); + + va_start(ap, format); + r = status_vprintf(status, flags, format, ap); + va_end(ap); + + return r; +} diff --git a/src/core/show-status.h b/src/core/show-status.h new file mode 100644 index 0000000..f574d92 --- /dev/null +++ b/src/core/show-status.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <stdbool.h> + +#include "macro.h" + +/* Manager status */ + +typedef enum ShowStatus { + SHOW_STATUS_NO, + SHOW_STATUS_AUTO, + SHOW_STATUS_TEMPORARY, + SHOW_STATUS_YES, + _SHOW_STATUS_MAX, + _SHOW_STATUS_INVALID = -1, +} ShowStatus; + +typedef enum ShowStatusFlags { + SHOW_STATUS_ELLIPSIZE = 1 << 0, + SHOW_STATUS_EPHEMERAL = 1 << 1, +} ShowStatusFlags; + +ShowStatus show_status_from_string(const char *v) _const_; +const char* show_status_to_string(ShowStatus s) _pure_; +int parse_show_status(const char *v, ShowStatus *ret); + +int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) _printf_(3,0); +int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) _printf_(3,4); diff --git a/src/core/shutdown.c b/src/core/shutdown.c new file mode 100644 index 0000000..cb47ee8 --- /dev/null +++ b/src/core/shutdown.c @@ -0,0 +1,545 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include <errno.h> +#include <getopt.h> +#include <linux/reboot.h> +#include <signal.h> +#include <stdbool.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <sys/reboot.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "async.h" +#include "cgroup-util.h" +#include "def.h" +#include "exec-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "killall.h" +#include "log.h" +#include "missing.h" +#include "parse-util.h" +#include "process-util.h" +#include "reboot-util.h" +#include "rlimit-util.h" +#include "signal-util.h" +#include "string-util.h" +#include "switch-root.h" +#include "terminal-util.h" +#include "umount.h" +#include "util.h" +#include "virt.h" +#include "watchdog.h" + +#define SYNC_PROGRESS_ATTEMPTS 3 +#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC) + +static char* arg_verb; +static uint8_t arg_exit_code; +static usec_t arg_timeout = DEFAULT_TIMEOUT_USEC; + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_LOG_LEVEL = 0x100, + ARG_LOG_TARGET, + ARG_LOG_COLOR, + ARG_LOG_LOCATION, + ARG_EXIT_CODE, + ARG_TIMEOUT, + }; + + static const struct option options[] = { + { "log-level", required_argument, NULL, ARG_LOG_LEVEL }, + { "log-target", required_argument, NULL, ARG_LOG_TARGET }, + { "log-color", optional_argument, NULL, ARG_LOG_COLOR }, + { "log-location", optional_argument, NULL, ARG_LOG_LOCATION }, + { "exit-code", required_argument, NULL, ARG_EXIT_CODE }, + { "timeout", required_argument, NULL, ARG_TIMEOUT }, + {} + }; + + int c, r; + + assert(argc >= 1); + assert(argv); + + /* "-" prevents getopt from permuting argv[] and moving the verb away + * from argv[1]. Our interface to initrd promises it'll be there. */ + while ((c = getopt_long(argc, argv, "-", options, NULL)) >= 0) + switch (c) { + + case ARG_LOG_LEVEL: + r = log_set_max_level_from_string(optarg); + if (r < 0) + log_error_errno(r, "Failed to parse log level %s, ignoring: %m", optarg); + + break; + + case ARG_LOG_TARGET: + r = log_set_target_from_string(optarg); + if (r < 0) + log_error_errno(r, "Failed to parse log target %s, ignoring: %m", optarg); + + break; + + case ARG_LOG_COLOR: + + if (optarg) { + r = log_show_color_from_string(optarg); + if (r < 0) + log_error_errno(r, "Failed to parse log color setting %s, ignoring: %m", optarg); + } else + log_show_color(true); + + break; + + case ARG_LOG_LOCATION: + if (optarg) { + r = log_show_location_from_string(optarg); + if (r < 0) + log_error_errno(r, "Failed to parse log location setting %s, ignoring: %m", optarg); + } else + log_show_location(true); + + break; + + case ARG_EXIT_CODE: + r = safe_atou8(optarg, &arg_exit_code); + if (r < 0) + log_error_errno(r, "Failed to parse exit code %s, ignoring: %m", optarg); + + break; + + case ARG_TIMEOUT: + r = parse_sec(optarg, &arg_timeout); + if (r < 0) + log_error_errno(r, "Failed to parse shutdown timeout %s, ignoring: %m", optarg); + + break; + + case '\001': + if (!arg_verb) + arg_verb = optarg; + else + log_error("Excess arguments, ignoring"); + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached("Unhandled option code."); + } + + if (!arg_verb) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Verb argument missing."); + + return 0; +} + +static int switch_root_initramfs(void) { + if (mount("/run/initramfs", "/run/initramfs", NULL, MS_BIND, NULL) < 0) + return log_error_errno(errno, "Failed to mount bind /run/initramfs on /run/initramfs: %m"); + + if (mount(NULL, "/run/initramfs", NULL, MS_PRIVATE, NULL) < 0) + return log_error_errno(errno, "Failed to make /run/initramfs private mount: %m"); + + /* switch_root with MS_BIND, because there might still be processes lurking around, which have open file descriptors. + * /run/initramfs/shutdown will take care of these. + * Also do not detach the old root, because /run/initramfs/shutdown needs to access it. + */ + return switch_root("/run/initramfs", "/oldroot", false, MS_BIND); +} + +/* Read the following fields from /proc/meminfo: + * + * NFS_Unstable + * Writeback + * Dirty + * + * Return true if the sum of these fields is greater than the previous + * value input. For all other issues, report the failure and indicate that + * the sync is not making progress. + */ +static bool sync_making_progress(unsigned long long *prev_dirty) { + _cleanup_fclose_ FILE *f = NULL; + unsigned long long val = 0; + bool r = false; + + f = fopen("/proc/meminfo", "re"); + if (!f) + return log_warning_errno(errno, "Failed to open /proc/meminfo: %m"); + + for (;;) { + _cleanup_free_ char *line = NULL; + unsigned long long ull = 0; + int q; + + q = read_line(f, LONG_LINE_MAX, &line); + if (q < 0) + return log_warning_errno(q, "Failed to parse /proc/meminfo: %m"); + if (q == 0) + break; + + if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:")) + continue; + + errno = 0; + if (sscanf(line, "%*s %llu %*s", &ull) != 1) { + if (errno != 0) + log_warning_errno(errno, "Failed to parse /proc/meminfo: %m"); + else + log_warning("Failed to parse /proc/meminfo"); + + return false; + } + + val += ull; + } + + r = *prev_dirty > val; + + *prev_dirty = val; + + return r; +} + +static void sync_with_progress(void) { + unsigned long long dirty = ULONG_LONG_MAX; + unsigned checks; + pid_t pid; + int r; + + BLOCK_SIGNALS(SIGCHLD); + + /* Due to the possiblity of the sync operation hanging, we fork a child process and monitor the progress. If + * the timeout lapses, the assumption is that that particular sync stalled. */ + + r = asynchronous_sync(&pid); + if (r < 0) { + log_error_errno(r, "Failed to fork sync(): %m"); + return; + } + + log_info("Syncing filesystems and block devices."); + + /* Start monitoring the sync operation. If more than + * SYNC_PROGRESS_ATTEMPTS lapse without progress being made, + * we assume that the sync is stalled */ + for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) { + r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC); + if (r == 0) + /* Sync finished without error. + * (The sync itself does not return an error code) */ + return; + else if (r == -ETIMEDOUT) { + /* Reset the check counter if the "Dirty" value is + * decreasing */ + if (sync_making_progress(&dirty)) + checks = 0; + } else { + log_error_errno(r, "Failed to sync filesystems and block devices: %m"); + return; + } + } + + /* Only reached in the event of a timeout. We should issue a kill + * to the stray process. */ + log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid); + (void) kill(pid, SIGKILL); +} + +int main(int argc, char *argv[]) { + bool need_umount, need_swapoff, need_loop_detach, need_dm_detach; + bool in_container, use_watchdog = false, can_initrd; + _cleanup_free_ char *cgroup = NULL; + char *arguments[3]; + int cmd, r, umount_log_level = LOG_INFO; + static const char* const dirs[] = {SYSTEM_SHUTDOWN_PATH, NULL}; + char *watchdog_device; + + /* The log target defaults to console, but the original systemd process will pass its log target in through a + * command line argument, which will override this default. Also, ensure we'll never log to the journal or + * syslog, as these logging daemons are either already dead or will die very soon. */ + + log_set_target(LOG_TARGET_CONSOLE); + log_set_prohibit_ipc(true); + log_parse_environment(); + + r = parse_argv(argc, argv); + if (r < 0) + goto error; + + log_open(); + + umask(0022); + + if (getpid_cached() != 1) { + log_error("Not executed by init (PID 1)."); + r = -EPERM; + goto error; + } + + if (streq(arg_verb, "reboot")) + cmd = RB_AUTOBOOT; + else if (streq(arg_verb, "poweroff")) + cmd = RB_POWER_OFF; + else if (streq(arg_verb, "halt")) + cmd = RB_HALT_SYSTEM; + else if (streq(arg_verb, "kexec")) + cmd = LINUX_REBOOT_CMD_KEXEC; + else if (streq(arg_verb, "exit")) + cmd = 0; /* ignored, just checking that arg_verb is valid */ + else { + log_error("Unknown action '%s'.", arg_verb); + r = -EINVAL; + goto error; + } + + (void) cg_get_root_path(&cgroup); + in_container = detect_container() > 0; + + use_watchdog = getenv("WATCHDOG_USEC"); + watchdog_device = getenv("WATCHDOG_DEVICE"); + if (watchdog_device) { + r = watchdog_set_device(watchdog_device); + if (r < 0) + log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", + watchdog_device); + } + + /* Lock us into memory */ + (void) mlockall(MCL_CURRENT|MCL_FUTURE); + + /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that + * slow IO is processed here already and the final process killing spree is not impacted by processes + * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will + * result. */ + if (!in_container) + sync_with_progress(); + + disable_coredumps(); + + log_info("Sending SIGTERM to remaining processes..."); + broadcast_signal(SIGTERM, true, true, arg_timeout); + + log_info("Sending SIGKILL to remaining processes..."); + broadcast_signal(SIGKILL, true, false, arg_timeout); + + need_umount = !in_container; + need_swapoff = !in_container; + need_loop_detach = !in_container; + need_dm_detach = !in_container; + can_initrd = !in_container && !in_initrd() && access("/run/initramfs/shutdown", X_OK) == 0; + + /* Unmount all mountpoints, swaps, and loopback devices */ + for (;;) { + bool changed = false; + + if (use_watchdog) + watchdog_ping(); + + /* Let's trim the cgroup tree on each iteration so + that we leave an empty cgroup tree around, so that + container managers get a nice notify event when we + are down */ + if (cgroup) + cg_trim(SYSTEMD_CGROUP_CONTROLLER, cgroup, false); + + if (need_umount) { + log_info("Unmounting file systems."); + r = umount_all(&changed, umount_log_level); + if (r == 0) { + need_umount = false; + log_info("All filesystems unmounted."); + } else if (r > 0) + log_info("Not all file systems unmounted, %d left.", r); + else + log_error_errno(r, "Failed to unmount file systems: %m"); + } + + if (need_swapoff) { + log_info("Deactivating swaps."); + r = swapoff_all(&changed); + if (r == 0) { + need_swapoff = false; + log_info("All swaps deactivated."); + } else if (r > 0) + log_info("Not all swaps deactivated, %d left.", r); + else + log_error_errno(r, "Failed to deactivate swaps: %m"); + } + + if (need_loop_detach) { + log_info("Detaching loop devices."); + r = loopback_detach_all(&changed, umount_log_level); + if (r == 0) { + need_loop_detach = false; + log_info("All loop devices detached."); + } else if (r > 0) + log_info("Not all loop devices detached, %d left.", r); + else + log_error_errno(r, "Failed to detach loop devices: %m"); + } + + if (need_dm_detach) { + log_info("Detaching DM devices."); + r = dm_detach_all(&changed, umount_log_level); + if (r == 0) { + need_dm_detach = false; + log_info("All DM devices detached."); + } else if (r > 0) + log_info("Not all DM devices detached, %d left.", r); + else + log_error_errno(r, "Failed to detach DM devices: %m"); + } + + if (!need_umount && !need_swapoff && !need_loop_detach && !need_dm_detach) { + log_info("All filesystems, swaps, loop devices and DM devices detached."); + /* Yay, done */ + break; + } + + if (!changed && umount_log_level == LOG_INFO && !can_initrd) { + /* There are things we cannot get rid of. Loop one more time + * with LOG_ERR to inform the user. Note that we don't need + * to do this if there is a initrd to switch to, because that + * one is likely to get rid of the remounting mounts. If not, + * it will log about them. */ + umount_log_level = LOG_ERR; + continue; + } + + /* If in this iteration we didn't manage to + * unmount/deactivate anything, we simply give up */ + if (!changed) { + log_info("Cannot finalize remaining%s%s%s%s continuing.", + need_umount ? " file systems," : "", + need_swapoff ? " swap devices," : "", + need_loop_detach ? " loop devices," : "", + need_dm_detach ? " DM devices," : ""); + break; + } + + log_debug("Couldn't finalize remaining %s%s%s%s trying again.", + need_umount ? " file systems," : "", + need_swapoff ? " swap devices," : "", + need_loop_detach ? " loop devices," : "", + need_dm_detach ? " DM devices," : ""); + } + + /* We're done with the watchdog. */ + watchdog_free_device(); + + arguments[0] = NULL; + arguments[1] = arg_verb; + arguments[2] = NULL; + execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, arguments, NULL); + + (void) rlimit_nofile_safe(); + + if (can_initrd) { + r = switch_root_initramfs(); + if (r >= 0) { + argv[0] = (char*) "/shutdown"; + + (void) setsid(); + (void) make_console_stdio(); + + log_info("Successfully changed into root pivot.\n" + "Returning to initrd..."); + + execv("/shutdown", argv); + log_error_errno(errno, "Failed to execute shutdown binary: %m"); + } else + log_error_errno(r, "Failed to switch root to \"/run/initramfs\": %m"); + + } + + if (need_umount || need_swapoff || need_loop_detach || need_dm_detach) + log_error("Failed to finalize %s%s%s%s ignoring", + need_umount ? " file systems," : "", + need_swapoff ? " swap devices," : "", + need_loop_detach ? " loop devices," : "", + need_dm_detach ? " DM devices," : ""); + + /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be + * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we + * sync'ed things already once above, but we did some more work since then which might have caused IO, hence + * let's do it once more. Do not remove this sync, data corruption will result. */ + if (!in_container) + sync_with_progress(); + + if (streq(arg_verb, "exit")) { + if (in_container) + return arg_exit_code; + + cmd = RB_POWER_OFF; /* We cannot exit() on the host, fallback on another method. */ + } + + switch (cmd) { + + case LINUX_REBOOT_CMD_KEXEC: + + if (!in_container) { + /* We cheat and exec kexec to avoid doing all its work */ + log_info("Rebooting with kexec."); + + r = safe_fork("(sd-kexec)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_WAIT, NULL); + if (r == 0) { + const char * const args[] = { + KEXEC, "-e", NULL + }; + + /* Child */ + + execv(args[0], (char * const *) args); + _exit(EXIT_FAILURE); + } + + /* If we are still running, then the kexec can't have worked, let's fall through */ + } + + cmd = RB_AUTOBOOT; + _fallthrough_; + + case RB_AUTOBOOT: + (void) reboot_with_parameter(REBOOT_LOG); + log_info("Rebooting."); + break; + + case RB_POWER_OFF: + log_info("Powering off."); + break; + + case RB_HALT_SYSTEM: + log_info("Halting system."); + break; + + default: + assert_not_reached("Unknown magic"); + } + + (void) reboot(cmd); + if (errno == EPERM && in_container) { + /* If we are in a container, and we lacked + * CAP_SYS_BOOT just exit, this will kill our + * container for good. */ + log_info("Exiting container."); + return EXIT_SUCCESS; + } + + r = log_error_errno(errno, "Failed to invoke reboot(): %m"); + + error: + log_emergency_errno(r, "Critical error while doing system shutdown: %m"); + freeze(); +} diff --git a/src/core/slice.c b/src/core/slice.c new file mode 100644 index 0000000..15b18bc --- /dev/null +++ b/src/core/slice.c @@ -0,0 +1,397 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> + +#include "alloc-util.h" +#include "dbus-slice.h" +#include "dbus-unit.h" +#include "log.h" +#include "serialize.h" +#include "slice.h" +#include "special.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = { + [SLICE_DEAD] = UNIT_INACTIVE, + [SLICE_ACTIVE] = UNIT_ACTIVE +}; + +static void slice_init(Unit *u) { + assert(u); + assert(u->load_state == UNIT_STUB); + + u->ignore_on_isolate = true; +} + +static void slice_set_state(Slice *t, SliceState state) { + SliceState old_state; + assert(t); + + if (t->state != state) + bus_unit_send_pending_change_signal(UNIT(t), false); + + old_state = t->state; + t->state = state; + + if (state != old_state) + log_debug("%s changed %s -> %s", + UNIT(t)->id, + slice_state_to_string(old_state), + slice_state_to_string(state)); + + unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int slice_add_parent_slice(Slice *s) { + Unit *u = UNIT(s), *parent; + _cleanup_free_ char *a = NULL; + int r; + + assert(s); + + if (UNIT_ISSET(u->slice)) + return 0; + + r = slice_build_parent_slice(u->id, &a); + if (r <= 0) /* 0 means root slice */ + return r; + + r = manager_load_unit(u->manager, a, NULL, NULL, &parent); + if (r < 0) + return r; + + unit_ref_set(&u->slice, u, parent); + return 0; +} + +static int slice_add_default_dependencies(Slice *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + /* Make sure slices are unloaded on shutdown */ + r = unit_add_two_dependencies_by_name( + UNIT(s), + UNIT_BEFORE, UNIT_CONFLICTS, + SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return 0; +} + +static int slice_verify(Slice *s) { + _cleanup_free_ char *parent = NULL; + int r; + + assert(s); + + if (UNIT(s)->load_state != UNIT_LOADED) + return 0; + + if (!slice_name_is_valid(UNIT(s)->id)) { + log_unit_error(UNIT(s), "Slice name %s is not valid. Refusing.", UNIT(s)->id); + return -ENOEXEC; + } + + r = slice_build_parent_slice(UNIT(s)->id, &parent); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to determine parent slice: %m"); + + if (parent ? !unit_has_name(UNIT_DEREF(UNIT(s)->slice), parent) : UNIT_ISSET(UNIT(s)->slice)) { + log_unit_error(UNIT(s), "Located outside of parent slice. Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static int slice_load_root_slice(Unit *u) { + assert(u); + + if (!unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + u->perpetual = true; + + /* The root slice is a bit special. For example it is always running and cannot be terminated. Because of its + * special semantics we synthesize it here, instead of relying on the unit file on disk. */ + + u->default_dependencies = false; + + if (!u->description) + u->description = strdup("Root Slice"); + if (!u->documentation) + u->documentation = strv_new("man:systemd.special(7)"); + + return 1; +} + +static int slice_load_system_slice(Unit *u) { + assert(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return 0; + if (!unit_has_name(u, SPECIAL_SYSTEM_SLICE)) + return 0; + + u->perpetual = true; + + /* The system slice is a bit special. For example it is always running and cannot be terminated. Because of its + * special semantics we synthesize it here, instead of relying on the unit file on disk. */ + + u->default_dependencies = false; + + if (!u->description) + u->description = strdup("System Slice"); + if (!u->documentation) + u->documentation = strv_new("man:systemd.special(7)"); + + return 1; +} + +static int slice_load(Unit *u) { + Slice *s = SLICE(u); + int r; + + assert(s); + assert(u->load_state == UNIT_STUB); + + r = slice_load_root_slice(u); + if (r < 0) + return r; + r = slice_load_system_slice(u); + if (r < 0) + return r; + + r = unit_load_fragment_and_dropin_optional(u); + if (r < 0) + return r; + + /* This is a new unit? Then let's add in some extras */ + if (u->load_state == UNIT_LOADED) { + + r = unit_patch_contexts(u); + if (r < 0) + return r; + + r = slice_add_parent_slice(s); + if (r < 0) + return r; + + r = slice_add_default_dependencies(s); + if (r < 0) + return r; + } + + return slice_verify(s); +} + +static int slice_coldplug(Unit *u) { + Slice *t = SLICE(u); + + assert(t); + assert(t->state == SLICE_DEAD); + + if (t->deserialized_state != t->state) + slice_set_state(t, t->deserialized_state); + + return 0; +} + +static void slice_dump(Unit *u, FILE *f, const char *prefix) { + Slice *t = SLICE(u); + + assert(t); + assert(f); + + fprintf(f, + "%sSlice State: %s\n", + prefix, slice_state_to_string(t->state)); + + cgroup_context_dump(&t->cgroup_context, f, prefix); +} + +static int slice_start(Unit *u) { + Slice *t = SLICE(u); + int r; + + assert(t); + assert(t->state == SLICE_DEAD); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + (void) unit_realize_cgroup(u); + (void) unit_reset_cpu_accounting(u); + (void) unit_reset_ip_accounting(u); + + slice_set_state(t, SLICE_ACTIVE); + return 1; +} + +static int slice_stop(Unit *u) { + Slice *t = SLICE(u); + + assert(t); + assert(t->state == SLICE_ACTIVE); + + /* We do not need to destroy the cgroup explicitly, + * unit_notify() will do that for us anyway. */ + + slice_set_state(t, SLICE_DEAD); + return 1; +} + +static int slice_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + return unit_kill_common(u, who, signo, -1, -1, error); +} + +static int slice_serialize(Unit *u, FILE *f, FDSet *fds) { + Slice *s = SLICE(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", slice_state_to_string(s->state)); + + return 0; +} + +static int slice_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Slice *s = SLICE(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + SliceState state; + + state = slice_state_from_string(value); + if (state < 0) + log_debug("Failed to parse state value %s", value); + else + s->deserialized_state = state; + + } else + log_debug("Unknown serialization key '%s'", key); + + return 0; +} + +_pure_ static UnitActiveState slice_active_state(Unit *u) { + assert(u); + + return state_translation_table[SLICE(u)->state]; +} + +_pure_ static const char *slice_sub_state_to_string(Unit *u) { + assert(u); + + return slice_state_to_string(SLICE(u)->state); +} + +static int slice_make_perpetual(Manager *m, const char *name, Unit **ret) { + Unit *u; + int r; + + assert(m); + assert(name); + + u = manager_get_unit(m, name); + if (!u) { + r = unit_new_for_name(m, sizeof(Slice), name, &u); + if (r < 0) + return log_error_errno(r, "Failed to allocate the special %s unit: %m", name); + } + + u->perpetual = true; + SLICE(u)->deserialized_state = SLICE_ACTIVE; + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); + + if (ret) + *ret = u; + + return 0; +} + +static void slice_enumerate_perpetual(Manager *m) { + Unit *u; + int r; + + assert(m); + + r = slice_make_perpetual(m, SPECIAL_ROOT_SLICE, &u); + if (r >= 0 && manager_owns_host_root_cgroup(m)) { + Slice *s = SLICE(u); + + /* If we are managing the root cgroup then this means our root slice covers the whole system, which + * means the kernel will track CPU/tasks/memory for us anyway, and it is all available in /proc. Let's + * hence turn accounting on here, so that our APIs to query this data are available. */ + + s->cgroup_context.cpu_accounting = true; + s->cgroup_context.tasks_accounting = true; + s->cgroup_context.memory_accounting = true; + } + + if (MANAGER_IS_SYSTEM(m)) + (void) slice_make_perpetual(m, SPECIAL_SYSTEM_SLICE, NULL); +} + +const UnitVTable slice_vtable = { + .object_size = sizeof(Slice), + .cgroup_context_offset = offsetof(Slice, cgroup_context), + + .sections = + "Unit\0" + "Slice\0" + "Install\0", + .private_section = "Slice", + + .can_transient = true, + + .init = slice_init, + .load = slice_load, + + .coldplug = slice_coldplug, + + .dump = slice_dump, + + .start = slice_start, + .stop = slice_stop, + + .kill = slice_kill, + + .serialize = slice_serialize, + .deserialize_item = slice_deserialize_item, + + .active_state = slice_active_state, + .sub_state_to_string = slice_sub_state_to_string, + + .bus_vtable = bus_slice_vtable, + .bus_set_property = bus_slice_set_property, + .bus_commit_properties = bus_slice_commit_properties, + + .enumerate_perpetual = slice_enumerate_perpetual, + + .status_message_formats = { + .finished_start_job = { + [JOB_DONE] = "Created slice %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Removed slice %s.", + }, + }, +}; diff --git a/src/core/slice.h b/src/core/slice.h new file mode 100644 index 0000000..4678c08 --- /dev/null +++ b/src/core/slice.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "unit.h" + +typedef struct Slice Slice; + +struct Slice { + Unit meta; + + SliceState state, deserialized_state; + + CGroupContext cgroup_context; +}; + +extern const UnitVTable slice_vtable; + +DEFINE_CAST(SLICE, Slice); diff --git a/src/core/smack-setup.c b/src/core/smack-setup.c new file mode 100644 index 0000000..49b37ae --- /dev/null +++ b/src/core/smack-setup.c @@ -0,0 +1,421 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/*** + Copyright © 2013 Intel Corporation + Authors: + Nathaniel Chen <nathaniel.chen@intel.com> +***/ + +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdio_ext.h> +#include <stdlib.h> +#include <string.h> + +#include "alloc-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "macro.h" +#include "smack-setup.h" +#include "string-util.h" +#include "util.h" + +#if ENABLE_SMACK + +static int write_access2_rules(const char* srcdir) { + _cleanup_close_ int load2_fd = -1, change_fd = -1; + _cleanup_closedir_ DIR *dir = NULL; + struct dirent *entry; + int dfd = -1; + int r = 0; + + load2_fd = open("/sys/fs/smackfs/load2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (load2_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/load2': %m"); + return -errno; /* negative error */ + } + + change_fd = open("/sys/fs/smackfs/change-rule", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (change_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/change-rule': %m"); + return -errno; /* negative error */ + } + + /* write rules to load2 or change-rule from every file in the directory */ + dir = opendir(srcdir); + if (!dir) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir); + return errno; /* positive on purpose */ + } + + dfd = dirfd(dir); + assert(dfd >= 0); + + FOREACH_DIRENT(entry, dir, return 0) { + int fd; + _cleanup_fclose_ FILE *policy = NULL; + + if (!dirent_is_file(entry)) + continue; + + fd = openat(dfd, entry->d_name, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + if (r == 0) + r = -errno; + log_warning_errno(errno, "Failed to open '%s': %m", entry->d_name); + continue; + } + + policy = fdopen(fd, "r"); + if (!policy) { + if (r == 0) + r = -errno; + safe_close(fd); + log_error_errno(errno, "Failed to open '%s': %m", entry->d_name); + continue; + } + + /* load2 write rules in the kernel require a line buffered stream */ + for (;;) { + _cleanup_free_ char *buf = NULL, *sbj = NULL, *obj = NULL, *acc1 = NULL, *acc2 = NULL; + int q; + + q = read_line(policy, NAME_MAX, &buf); + if (q < 0) + return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name); + if (q == 0) + break; + + if (isempty(buf) || strchr(COMMENTS, buf[0])) + continue; + + /* if 3 args -> load rule : subject object access1 */ + /* if 4 args -> change rule : subject object access1 access2 */ + if (sscanf(buf, "%ms %ms %ms %ms", &sbj, &obj, &acc1, &acc2) < 3) { + log_error_errno(errno, "Failed to parse rule '%s' in '%s', ignoring.", buf, entry->d_name); + continue; + } + + if (write(isempty(acc2) ? load2_fd : change_fd, buf, strlen(buf)) < 0) { + if (r == 0) + r = -errno; + log_error_errno(errno, "Failed to write '%s' to '%s' in '%s': %m", + buf, isempty(acc2) ? "/sys/fs/smackfs/load2" : "/sys/fs/smackfs/change-rule", entry->d_name); + } + } + } + + return r; +} + +static int write_cipso2_rules(const char* srcdir) { + _cleanup_close_ int cipso2_fd = -1; + _cleanup_closedir_ DIR *dir = NULL; + struct dirent *entry; + int dfd = -1; + int r = 0; + + cipso2_fd = open("/sys/fs/smackfs/cipso2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (cipso2_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/cipso2': %m"); + return -errno; /* negative error */ + } + + /* write rules to cipso2 from every file in the directory */ + dir = opendir(srcdir); + if (!dir) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir); + return errno; /* positive on purpose */ + } + + dfd = dirfd(dir); + assert(dfd >= 0); + + FOREACH_DIRENT(entry, dir, return 0) { + int fd; + _cleanup_fclose_ FILE *policy = NULL; + + if (!dirent_is_file(entry)) + continue; + + fd = openat(dfd, entry->d_name, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + if (r == 0) + r = -errno; + log_error_errno(errno, "Failed to open '%s': %m", entry->d_name); + continue; + } + + policy = fdopen(fd, "r"); + if (!policy) { + if (r == 0) + r = -errno; + safe_close(fd); + log_error_errno(errno, "Failed to open '%s': %m", entry->d_name); + continue; + } + + /* cipso2 write rules in the kernel require a line buffered stream */ + for (;;) { + _cleanup_free_ char *buf = NULL; + int q; + + q = read_line(policy, NAME_MAX, &buf); + if (q < 0) + return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name); + if (q == 0) + break; + + if (isempty(buf) || strchr(COMMENTS, buf[0])) + continue; + + if (write(cipso2_fd, buf, strlen(buf)) < 0) { + if (r == 0) + r = -errno; + log_error_errno(errno, "Failed to write '%s' to '/sys/fs/smackfs/cipso2' in '%s': %m", + buf, entry->d_name); + break; + } + } + } + + return r; +} + +static int write_netlabel_rules(const char* srcdir) { + _cleanup_fclose_ FILE *dst = NULL; + _cleanup_closedir_ DIR *dir = NULL; + struct dirent *entry; + int dfd = -1; + int r = 0; + + dst = fopen("/sys/fs/smackfs/netlabel", "we"); + if (!dst) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open /sys/fs/smackfs/netlabel: %m"); + return -errno; /* negative error */ + } + + /* write rules to dst from every file in the directory */ + dir = opendir(srcdir); + if (!dir) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to opendir %s: %m", srcdir); + return errno; /* positive on purpose */ + } + + dfd = dirfd(dir); + assert(dfd >= 0); + + FOREACH_DIRENT(entry, dir, return 0) { + int fd; + _cleanup_fclose_ FILE *policy = NULL; + + fd = openat(dfd, entry->d_name, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + if (r == 0) + r = -errno; + log_warning_errno(errno, "Failed to open %s: %m", entry->d_name); + continue; + } + + policy = fdopen(fd, "r"); + if (!policy) { + if (r == 0) + r = -errno; + safe_close(fd); + log_error_errno(errno, "Failed to open %s: %m", entry->d_name); + continue; + } + + (void) __fsetlocking(policy, FSETLOCKING_BYCALLER); + + /* load2 write rules in the kernel require a line buffered stream */ + for (;;) { + _cleanup_free_ char *buf = NULL; + int q; + + q = read_line(policy, NAME_MAX, &buf); + if (q < 0) + return log_error_errno(q, "Failed to read line from %s: %m", entry->d_name); + if (q == 0) + break; + + if (!fputs(buf, dst)) { + if (r == 0) + r = -EINVAL; + log_error_errno(errno, "Failed to write line to /sys/fs/smackfs/netlabel: %m"); + break; + } + q = fflush_and_check(dst); + if (q < 0) { + if (r == 0) + r = q; + log_error_errno(q, "Failed to flush writes to /sys/fs/smackfs/netlabel: %m"); + break; + } + } + } + + return r; +} + +static int write_onlycap_list(void) { + _cleanup_close_ int onlycap_fd = -1; + _cleanup_free_ char *list = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t len = 0, allocated = 0; + int r; + + f = fopen("/etc/smack/onlycap", "re"); + if (!f) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to read '/etc/smack/onlycap': %m"); + + return errno == ENOENT ? ENOENT : -errno; + } + + for (;;) { + _cleanup_free_ char *buf = NULL; + size_t l; + + r = read_line(f, LONG_LINE_MAX, &buf); + if (r < 0) + return log_error_errno(r, "Failed to read line from /etc/smack/onlycap: %m"); + if (r == 0) + break; + + if (isempty(buf) || strchr(COMMENTS, *buf)) + continue; + + l = strlen(buf); + if (!GREEDY_REALLOC(list, allocated, len + l + 1)) + return log_oom(); + + stpcpy(list + len, buf)[0] = ' '; + len += l + 1; + } + + if (len == 0) + return 0; + + list[len - 1] = 0; + + onlycap_fd = open("/sys/fs/smackfs/onlycap", O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (onlycap_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/onlycap': %m"); + return -errno; /* negative error */ + } + + r = write(onlycap_fd, list, len); + if (r < 0) + return log_error_errno(errno, "Failed to write onlycap list(%s) to '/sys/fs/smackfs/onlycap': %m", list); + + return 0; +} + +#endif + +int mac_smack_setup(bool *loaded_policy) { + +#if ENABLE_SMACK + + int r; + + assert(loaded_policy); + + r = write_access2_rules("/etc/smack/accesses.d/"); + switch(r) { + case -ENOENT: + log_debug("Smack is not enabled in the kernel."); + return 0; + case ENOENT: + log_debug("Smack access rules directory '/etc/smack/accesses.d/' not found"); + return 0; + case 0: + log_info("Successfully loaded Smack policies."); + break; + default: + log_warning_errno(r, "Failed to load Smack access rules, ignoring: %m"); + return 0; + } + +#ifdef SMACK_RUN_LABEL + r = write_string_file("/proc/self/attr/current", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set SMACK label \"" SMACK_RUN_LABEL "\" on self: %m"); + r = write_string_file("/sys/fs/smackfs/ambient", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set SMACK ambient label \"" SMACK_RUN_LABEL "\": %m"); + r = write_string_file("/sys/fs/smackfs/netlabel", + "0.0.0.0/0 " SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set SMACK netlabel rule \"0.0.0.0/0 " SMACK_RUN_LABEL "\": %m"); + r = write_string_file("/sys/fs/smackfs/netlabel", "127.0.0.1 -CIPSO", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set SMACK netlabel rule \"127.0.0.1 -CIPSO\": %m"); +#endif + + r = write_cipso2_rules("/etc/smack/cipso.d/"); + switch(r) { + case -ENOENT: + log_debug("Smack/CIPSO is not enabled in the kernel."); + return 0; + case ENOENT: + log_debug("Smack/CIPSO access rules directory '/etc/smack/cipso.d/' not found"); + break; + case 0: + log_info("Successfully loaded Smack/CIPSO policies."); + break; + default: + log_warning_errno(r, "Failed to load Smack/CIPSO access rules, ignoring: %m"); + break; + } + + r = write_netlabel_rules("/etc/smack/netlabel.d/"); + switch(r) { + case -ENOENT: + log_debug("Smack/CIPSO is not enabled in the kernel."); + return 0; + case ENOENT: + log_debug("Smack network host rules directory '/etc/smack/netlabel.d/' not found"); + break; + case 0: + log_info("Successfully loaded Smack network host rules."); + break; + default: + log_warning_errno(r, "Failed to load Smack network host rules: %m, ignoring."); + break; + } + + r = write_onlycap_list(); + switch(r) { + case -ENOENT: + log_debug("Smack is not enabled in the kernel."); + break; + case ENOENT: + log_debug("Smack onlycap list file '/etc/smack/onlycap' not found"); + break; + case 0: + log_info("Successfully wrote Smack onlycap list."); + break; + default: + log_emergency_errno(r, "Failed to write Smack onlycap list: %m"); + return r; + } + + *loaded_policy = true; + +#endif + + return 0; +} diff --git a/src/core/smack-setup.h b/src/core/smack-setup.h new file mode 100644 index 0000000..b65daaf --- /dev/null +++ b/src/core/smack-setup.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +/*** + Copyright © 2013 Intel Corporation + Authors: + Nathaniel Chen <nathaniel.chen@intel.com> +***/ + +int mac_smack_setup(bool *loaded_policy); diff --git a/src/core/socket.c b/src/core/socket.c new file mode 100644 index 0000000..af95e90 --- /dev/null +++ b/src/core/socket.c @@ -0,0 +1,3339 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <arpa/inet.h> +#include <errno.h> +#include <fcntl.h> +#include <mqueue.h> +#include <netinet/tcp.h> +#include <signal.h> +#include <sys/epoll.h> +#include <sys/stat.h> +#include <unistd.h> +#include <linux/sctp.h> + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bus-error.h" +#include "bus-util.h" +#include "copy.h" +#include "dbus-socket.h" +#include "dbus-unit.h" +#include "def.h" +#include "exit-status.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "in-addr-util.h" +#include "io-util.h" +#include "ip-protocol-list.h" +#include "label.h" +#include "log.h" +#include "missing.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "selinux-util.h" +#include "serialize.h" +#include "signal-util.h" +#include "smack-util.h" +#include "socket.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" +#include "user-util.h" + +struct SocketPeer { + unsigned n_ref; + + Socket *socket; + union sockaddr_union peer; + socklen_t peer_salen; +}; + +static const UnitActiveState state_translation_table[_SOCKET_STATE_MAX] = { + [SOCKET_DEAD] = UNIT_INACTIVE, + [SOCKET_START_PRE] = UNIT_ACTIVATING, + [SOCKET_START_CHOWN] = UNIT_ACTIVATING, + [SOCKET_START_POST] = UNIT_ACTIVATING, + [SOCKET_LISTENING] = UNIT_ACTIVE, + [SOCKET_RUNNING] = UNIT_ACTIVE, + [SOCKET_STOP_PRE] = UNIT_DEACTIVATING, + [SOCKET_STOP_PRE_SIGTERM] = UNIT_DEACTIVATING, + [SOCKET_STOP_PRE_SIGKILL] = UNIT_DEACTIVATING, + [SOCKET_STOP_POST] = UNIT_DEACTIVATING, + [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SOCKET_FAILED] = UNIT_FAILED +}; + +static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); + +static void socket_init(Unit *u) { + Socket *s = SOCKET(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + s->backlog = SOMAXCONN; + s->timeout_usec = u->manager->default_timeout_start_usec; + s->directory_mode = 0755; + s->socket_mode = 0666; + + s->max_connections = 64; + + s->priority = -1; + s->ip_tos = -1; + s->ip_ttl = -1; + s->mark = -1; + + s->exec_context.std_output = u->manager->default_std_output; + s->exec_context.std_error = u->manager->default_std_error; + + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + + s->trigger_limit.interval = USEC_INFINITY; + s->trigger_limit.burst = (unsigned) -1; +} + +static void socket_unwatch_control_pid(Socket *s) { + assert(s); + + if (s->control_pid <= 0) + return; + + unit_unwatch_pid(UNIT(s), s->control_pid); + s->control_pid = 0; +} + +static void socket_cleanup_fd_list(SocketPort *p) { + assert(p); + + close_many(p->auxiliary_fds, p->n_auxiliary_fds); + p->auxiliary_fds = mfree(p->auxiliary_fds); + p->n_auxiliary_fds = 0; +} + +void socket_free_ports(Socket *s) { + SocketPort *p; + + assert(s); + + while ((p = s->ports)) { + LIST_REMOVE(port, s->ports, p); + + sd_event_source_unref(p->event_source); + + socket_cleanup_fd_list(p); + safe_close(p->fd); + free(p->path); + free(p); + } +} + +static void socket_done(Unit *u) { + Socket *s = SOCKET(u); + SocketPeer *p; + + assert(s); + + socket_free_ports(s); + + while ((p = set_steal_first(s->peers_by_address))) + p->socket = NULL; + + s->peers_by_address = set_free(s->peers_by_address); + + s->exec_runtime = exec_runtime_unref(s->exec_runtime, false); + exec_command_free_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX); + s->control_command = NULL; + + dynamic_creds_unref(&s->dynamic_creds); + + socket_unwatch_control_pid(s); + + unit_ref_unset(&s->service); + + s->tcp_congestion = mfree(s->tcp_congestion); + s->bind_to_device = mfree(s->bind_to_device); + + s->smack = mfree(s->smack); + s->smack_ip_in = mfree(s->smack_ip_in); + s->smack_ip_out = mfree(s->smack_ip_out); + + strv_free(s->symlinks); + + s->user = mfree(s->user); + s->group = mfree(s->group); + + s->fdname = mfree(s->fdname); + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); +} + +static int socket_arm_timer(Socket *s, usec_t usec) { + int r; + + assert(s); + + if (s->timer_event_source) { + r = sd_event_source_set_time(s->timer_event_source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time( + UNIT(s)->manager->event, + &s->timer_event_source, + CLOCK_MONOTONIC, + usec, 0, + socket_dispatch_timer, s); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->timer_event_source, "socket-timer"); + + return 0; +} + +int socket_instantiate_service(Socket *s) { + _cleanup_free_ char *prefix = NULL, *name = NULL; + int r; + Unit *u; + + assert(s); + + /* This fills in s->service if it isn't filled in yet. For + * Accept=yes sockets we create the next connection service + * here. For Accept=no this is mostly a NOP since the service + * is figured out at load time anyway. */ + + if (UNIT_DEREF(s->service)) + return 0; + + if (!s->accept) + return 0; + + r = unit_name_to_prefix(UNIT(s)->id, &prefix); + if (r < 0) + return r; + + if (asprintf(&name, "%s@%u.service", prefix, s->n_accepted) < 0) + return -ENOMEM; + + r = manager_load_unit(UNIT(s)->manager, name, NULL, NULL, &u); + if (r < 0) + return r; + + unit_ref_set(&s->service, UNIT(s), u); + + return unit_add_two_dependencies(UNIT(s), UNIT_BEFORE, UNIT_TRIGGERS, u, false, UNIT_DEPENDENCY_IMPLICIT); +} + +static bool have_non_accept_socket(Socket *s) { + SocketPort *p; + + assert(s); + + if (!s->accept) + return true; + + LIST_FOREACH(port, p, s->ports) { + + if (p->type != SOCKET_SOCKET) + return true; + + if (!socket_address_can_accept(&p->address)) + return true; + } + + return false; +} + +static int socket_add_mount_dependencies(Socket *s) { + SocketPort *p; + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + const char *path = NULL; + + if (p->type == SOCKET_SOCKET) + path = socket_address_get_path(&p->address); + else if (IN_SET(p->type, SOCKET_FIFO, SOCKET_SPECIAL, SOCKET_USB_FUNCTION)) + path = p->path; + + if (!path) + continue; + + r = unit_require_mounts_for(UNIT(s), path, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + return 0; +} + +static int socket_add_device_dependencies(Socket *s) { + char *t; + + assert(s); + + if (!s->bind_to_device || streq(s->bind_to_device, "lo")) + return 0; + + t = strjoina("/sys/subsystem/net/devices/", s->bind_to_device); + return unit_add_node_dependency(UNIT(s), t, false, UNIT_BINDS_TO, UNIT_DEPENDENCY_FILE); +} + +static int socket_add_default_dependencies(Socket *s) { + int r; + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SOCKETS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) { + r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } + + return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +_pure_ static bool socket_has_exec(Socket *s) { + unsigned i; + assert(s); + + for (i = 0; i < _SOCKET_EXEC_COMMAND_MAX; i++) + if (s->exec_command[i]) + return true; + + return false; +} + +static int socket_add_extras(Socket *s) { + Unit *u = UNIT(s); + int r; + + assert(s); + + /* Pick defaults for the trigger limit, if nothing was explicitly configured. We pick a relatively high limit + * in Accept=yes mode, and a lower limit for Accept=no. Reason: in Accept=yes mode we are invoking accept() + * ourselves before the trigger limit can hit, thus incoming connections are taken off the socket queue quickly + * and reliably. This is different for Accept=no, where the spawned service has to take the incoming traffic + * off the queues, which it might not necessarily do. Moreover, while Accept=no services are supposed to + * process whatever is queued in one go, and thus should normally never have to be started frequently. This is + * different for Accept=yes where each connection is processed by a new service instance, and thus frequent + * service starts are typical. */ + + if (s->trigger_limit.interval == USEC_INFINITY) + s->trigger_limit.interval = 2 * USEC_PER_SEC; + + if (s->trigger_limit.burst == (unsigned) -1) { + if (s->accept) + s->trigger_limit.burst = 200; + else + s->trigger_limit.burst = 20; + } + + if (have_non_accept_socket(s)) { + + if (!UNIT_DEREF(s->service)) { + Unit *x; + + r = unit_load_related_unit(u, ".service", &x); + if (r < 0) + return r; + + unit_ref_set(&s->service, u, x); + } + + r = unit_add_two_dependencies(u, UNIT_BEFORE, UNIT_TRIGGERS, UNIT_DEREF(s->service), true, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + } + + r = socket_add_mount_dependencies(s); + if (r < 0) + return r; + + r = socket_add_device_dependencies(s); + if (r < 0) + return r; + + r = unit_patch_contexts(u); + if (r < 0) + return r; + + if (socket_has_exec(s)) { + r = unit_add_exec_dependencies(u, &s->exec_context); + if (r < 0) + return r; + } + + r = unit_set_default_slice(u); + if (r < 0) + return r; + + r = socket_add_default_dependencies(s); + if (r < 0) + return r; + + return 0; +} + +static const char *socket_find_symlink_target(Socket *s) { + const char *found = NULL; + SocketPort *p; + + LIST_FOREACH(port, p, s->ports) { + const char *f = NULL; + + switch (p->type) { + + case SOCKET_FIFO: + f = p->path; + break; + + case SOCKET_SOCKET: + f = socket_address_get_path(&p->address); + break; + + default: + break; + } + + if (f) { + if (found) + return NULL; + + found = f; + } + } + + return found; +} + +static int socket_verify(Socket *s) { + assert(s); + + if (UNIT(s)->load_state != UNIT_LOADED) + return 0; + + if (!s->ports) { + log_unit_error(UNIT(s), "Unit has no Listen setting (ListenStream=, ListenDatagram=, ListenFIFO=, ...). Refusing."); + return -ENOEXEC; + } + + if (s->accept && have_non_accept_socket(s)) { + log_unit_error(UNIT(s), "Unit configured for accepting sockets, but sockets are non-accepting. Refusing."); + return -ENOEXEC; + } + + if (s->accept && s->max_connections <= 0) { + log_unit_error(UNIT(s), "MaxConnection= setting too small. Refusing."); + return -ENOEXEC; + } + + if (s->accept && UNIT_DEREF(s->service)) { + log_unit_error(UNIT(s), "Explicit service configuration for accepting socket units not supported. Refusing."); + return -ENOEXEC; + } + + if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP) { + log_unit_error(UNIT(s), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing."); + return -ENOEXEC; + } + + if (!strv_isempty(s->symlinks) && !socket_find_symlink_target(s)) { + log_unit_error(UNIT(s), "Unit has symlinks set but none or more than one node in the file system. Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static void peer_address_hash_func(const SocketPeer *s, struct siphash *state) { + assert(s); + + if (s->peer.sa.sa_family == AF_INET) + siphash24_compress(&s->peer.in.sin_addr, sizeof(s->peer.in.sin_addr), state); + else if (s->peer.sa.sa_family == AF_INET6) + siphash24_compress(&s->peer.in6.sin6_addr, sizeof(s->peer.in6.sin6_addr), state); + else if (s->peer.sa.sa_family == AF_VSOCK) + siphash24_compress(&s->peer.vm.svm_cid, sizeof(s->peer.vm.svm_cid), state); + else + assert_not_reached("Unknown address family."); +} + +static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) { + int r; + + r = CMP(x->peer.sa.sa_family, y->peer.sa.sa_family); + if (r != 0) + return r; + + switch(x->peer.sa.sa_family) { + case AF_INET: + return memcmp(&x->peer.in.sin_addr, &y->peer.in.sin_addr, sizeof(x->peer.in.sin_addr)); + case AF_INET6: + return memcmp(&x->peer.in6.sin6_addr, &y->peer.in6.sin6_addr, sizeof(x->peer.in6.sin6_addr)); + case AF_VSOCK: + return CMP(x->peer.vm.svm_cid, y->peer.vm.svm_cid); + } + assert_not_reached("Black sheep in the family!"); +} + +DEFINE_PRIVATE_HASH_OPS(peer_address_hash_ops, SocketPeer, peer_address_hash_func, peer_address_compare_func); + +static int socket_load(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + r = set_ensure_allocated(&s->peers_by_address, &peer_address_hash_ops); + if (r < 0) + return r; + + r = unit_load_fragment_and_dropin(u); + if (r < 0) + return r; + + if (u->load_state == UNIT_LOADED) { + /* This is a new unit? Then let's add in some extras */ + r = socket_add_extras(s); + if (r < 0) + return r; + } + + return socket_verify(s); +} + +static SocketPeer *socket_peer_new(void) { + SocketPeer *p; + + p = new0(SocketPeer, 1); + if (!p) + return NULL; + + p->n_ref = 1; + + return p; +} + +static SocketPeer *socket_peer_free(SocketPeer *p) { + assert(p); + + if (p->socket) + set_remove(p->socket->peers_by_address, p); + + return mfree(p); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(SocketPeer, socket_peer, socket_peer_free); + +int socket_acquire_peer(Socket *s, int fd, SocketPeer **p) { + _cleanup_(socket_peer_unrefp) SocketPeer *remote = NULL; + SocketPeer sa = {}, *i; + socklen_t salen = sizeof(sa.peer); + int r; + + assert(fd >= 0); + assert(s); + + r = getpeername(fd, &sa.peer.sa, &salen); + if (r < 0) + return log_unit_error_errno(UNIT(s), errno, "getpeername failed: %m"); + + if (!IN_SET(sa.peer.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) { + *p = NULL; + return 0; + } + + i = set_get(s->peers_by_address, &sa); + if (i) { + *p = socket_peer_ref(i); + return 1; + } + + remote = socket_peer_new(); + if (!remote) + return log_oom(); + + remote->peer = sa.peer; + remote->peer_salen = salen; + + r = set_put(s->peers_by_address, remote); + if (r < 0) + return r; + + remote->socket = s; + + *p = TAKE_PTR(remote); + + return 1; +} + +_const_ static const char* listen_lookup(int family, int type) { + + if (family == AF_NETLINK) + return "ListenNetlink"; + + if (type == SOCK_STREAM) + return "ListenStream"; + else if (type == SOCK_DGRAM) + return "ListenDatagram"; + else if (type == SOCK_SEQPACKET) + return "ListenSequentialPacket"; + + assert_not_reached("Unknown socket type"); + return NULL; +} + +static void socket_dump(Unit *u, FILE *f, const char *prefix) { + char time_string[FORMAT_TIMESPAN_MAX]; + SocketExecCommand c; + Socket *s = SOCKET(u); + SocketPort *p; + const char *prefix2, *str; + + assert(s); + assert(f); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + fprintf(f, + "%sSocket State: %s\n" + "%sResult: %s\n" + "%sBindIPv6Only: %s\n" + "%sBacklog: %u\n" + "%sSocketMode: %04o\n" + "%sDirectoryMode: %04o\n" + "%sKeepAlive: %s\n" + "%sNoDelay: %s\n" + "%sFreeBind: %s\n" + "%sTransparent: %s\n" + "%sBroadcast: %s\n" + "%sPassCredentials: %s\n" + "%sPassSecurity: %s\n" + "%sTCPCongestion: %s\n" + "%sRemoveOnStop: %s\n" + "%sWritable: %s\n" + "%sFileDescriptorName: %s\n" + "%sSELinuxContextFromNet: %s\n", + prefix, socket_state_to_string(s->state), + prefix, socket_result_to_string(s->result), + prefix, socket_address_bind_ipv6_only_to_string(s->bind_ipv6_only), + prefix, s->backlog, + prefix, s->socket_mode, + prefix, s->directory_mode, + prefix, yes_no(s->keep_alive), + prefix, yes_no(s->no_delay), + prefix, yes_no(s->free_bind), + prefix, yes_no(s->transparent), + prefix, yes_no(s->broadcast), + prefix, yes_no(s->pass_cred), + prefix, yes_no(s->pass_sec), + prefix, strna(s->tcp_congestion), + prefix, yes_no(s->remove_on_stop), + prefix, yes_no(s->writable), + prefix, socket_fdname(s), + prefix, yes_no(s->selinux_context_from_net)); + + if (s->control_pid > 0) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, s->control_pid); + + if (s->bind_to_device) + fprintf(f, + "%sBindToDevice: %s\n", + prefix, s->bind_to_device); + + if (s->accept) + fprintf(f, + "%sAccepted: %u\n" + "%sNConnections: %u\n" + "%sMaxConnections: %u\n" + "%sMaxConnectionsPerSource: %u\n", + prefix, s->n_accepted, + prefix, s->n_connections, + prefix, s->max_connections, + prefix, s->max_connections_per_source); + + if (s->priority >= 0) + fprintf(f, + "%sPriority: %i\n", + prefix, s->priority); + + if (s->receive_buffer > 0) + fprintf(f, + "%sReceiveBuffer: %zu\n", + prefix, s->receive_buffer); + + if (s->send_buffer > 0) + fprintf(f, + "%sSendBuffer: %zu\n", + prefix, s->send_buffer); + + if (s->ip_tos >= 0) + fprintf(f, + "%sIPTOS: %i\n", + prefix, s->ip_tos); + + if (s->ip_ttl >= 0) + fprintf(f, + "%sIPTTL: %i\n", + prefix, s->ip_ttl); + + if (s->pipe_size > 0) + fprintf(f, + "%sPipeSize: %zu\n", + prefix, s->pipe_size); + + if (s->mark >= 0) + fprintf(f, + "%sMark: %i\n", + prefix, s->mark); + + if (s->mq_maxmsg > 0) + fprintf(f, + "%sMessageQueueMaxMessages: %li\n", + prefix, s->mq_maxmsg); + + if (s->mq_msgsize > 0) + fprintf(f, + "%sMessageQueueMessageSize: %li\n", + prefix, s->mq_msgsize); + + if (s->reuse_port) + fprintf(f, + "%sReusePort: %s\n", + prefix, yes_no(s->reuse_port)); + + if (s->smack) + fprintf(f, + "%sSmackLabel: %s\n", + prefix, s->smack); + + if (s->smack_ip_in) + fprintf(f, + "%sSmackLabelIPIn: %s\n", + prefix, s->smack_ip_in); + + if (s->smack_ip_out) + fprintf(f, + "%sSmackLabelIPOut: %s\n", + prefix, s->smack_ip_out); + + if (!isempty(s->user) || !isempty(s->group)) + fprintf(f, + "%sSocketUser: %s\n" + "%sSocketGroup: %s\n", + prefix, strna(s->user), + prefix, strna(s->group)); + + if (s->keep_alive_time > 0) + fprintf(f, + "%sKeepAliveTimeSec: %s\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->keep_alive_time, USEC_PER_SEC)); + + if (s->keep_alive_interval > 0) + fprintf(f, + "%sKeepAliveIntervalSec: %s\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->keep_alive_interval, USEC_PER_SEC)); + + if (s->keep_alive_cnt > 0) + fprintf(f, + "%sKeepAliveProbes: %u\n", + prefix, s->keep_alive_cnt); + + if (s->defer_accept > 0) + fprintf(f, + "%sDeferAcceptSec: %s\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->defer_accept, USEC_PER_SEC)); + + LIST_FOREACH(port, p, s->ports) { + + switch (p->type) { + case SOCKET_SOCKET: { + _cleanup_free_ char *k = NULL; + const char *t; + int r; + + r = socket_address_print(&p->address, &k); + if (r < 0) + t = strerror(-r); + else + t = k; + + fprintf(f, "%s%s: %s\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type), t); + break; + } + case SOCKET_SPECIAL: + fprintf(f, "%sListenSpecial: %s\n", prefix, p->path); + break; + case SOCKET_USB_FUNCTION: + fprintf(f, "%sListenUSBFunction: %s\n", prefix, p->path); + break; + case SOCKET_MQUEUE: + fprintf(f, "%sListenMessageQueue: %s\n", prefix, p->path); + break; + default: + fprintf(f, "%sListenFIFO: %s\n", prefix, p->path); + } + } + + fprintf(f, + "%sTriggerLimitIntervalSec: %s\n" + "%sTriggerLimitBurst: %u\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->trigger_limit.interval, USEC_PER_SEC), + prefix, s->trigger_limit.burst); + + str = ip_protocol_to_name(s->socket_protocol); + if (str) + fprintf(f, "%sSocketProtocol: %s\n", prefix, str); + + if (!strv_isempty(s->symlinks)) { + char **q; + + fprintf(f, "%sSymlinks:", prefix); + STRV_FOREACH(q, s->symlinks) + fprintf(f, " %s", *q); + + fprintf(f, "\n"); + } + + fprintf(f, + "%sTimeoutSec: %s\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->timeout_usec, USEC_PER_SEC)); + + exec_context_dump(&s->exec_context, f, prefix); + kill_context_dump(&s->kill_context, f, prefix); + + for (c = 0; c < _SOCKET_EXEC_COMMAND_MAX; c++) { + if (!s->exec_command[c]) + continue; + + fprintf(f, "%s-> %s:\n", + prefix, socket_exec_command_to_string(c)); + + exec_command_dump_list(s->exec_command[c], f, prefix2); + } + + cgroup_context_dump(&s->cgroup_context, f, prefix); +} + +static int instance_from_socket(int fd, unsigned nr, char **instance) { + socklen_t l; + char *r; + union sockaddr_union local, remote; + + assert(fd >= 0); + assert(instance); + + l = sizeof(local); + if (getsockname(fd, &local.sa, &l) < 0) + return -errno; + + l = sizeof(remote); + if (getpeername(fd, &remote.sa, &l) < 0) + return -errno; + + switch (local.sa.sa_family) { + + case AF_INET: { + uint32_t + a = be32toh(local.in.sin_addr.s_addr), + b = be32toh(remote.in.sin_addr.s_addr); + + if (asprintf(&r, + "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u", + nr, + a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF, + be16toh(local.in.sin_port), + b >> 24, (b >> 16) & 0xFF, (b >> 8) & 0xFF, b & 0xFF, + be16toh(remote.in.sin_port)) < 0) + return -ENOMEM; + + break; + } + + case AF_INET6: { + static const unsigned char ipv4_prefix[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF + }; + + if (memcmp(&local.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0 && + memcmp(&remote.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0) { + const uint8_t + *a = local.in6.sin6_addr.s6_addr+12, + *b = remote.in6.sin6_addr.s6_addr+12; + + if (asprintf(&r, + "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u", + nr, + a[0], a[1], a[2], a[3], + be16toh(local.in6.sin6_port), + b[0], b[1], b[2], b[3], + be16toh(remote.in6.sin6_port)) < 0) + return -ENOMEM; + } else { + char a[INET6_ADDRSTRLEN], b[INET6_ADDRSTRLEN]; + + if (asprintf(&r, + "%u-%s:%u-%s:%u", + nr, + inet_ntop(AF_INET6, &local.in6.sin6_addr, a, sizeof(a)), + be16toh(local.in6.sin6_port), + inet_ntop(AF_INET6, &remote.in6.sin6_addr, b, sizeof(b)), + be16toh(remote.in6.sin6_port)) < 0) + return -ENOMEM; + } + + break; + } + + case AF_UNIX: { + struct ucred ucred; + int k; + + k = getpeercred(fd, &ucred); + if (k >= 0) { + if (asprintf(&r, + "%u-"PID_FMT"-"UID_FMT, + nr, ucred.pid, ucred.uid) < 0) + return -ENOMEM; + } else if (k == -ENODATA) { + /* This handles the case where somebody is + * connecting from another pid/uid namespace + * (e.g. from outside of our container). */ + if (asprintf(&r, + "%u-unknown", + nr) < 0) + return -ENOMEM; + } else + return k; + + break; + } + + case AF_VSOCK: + if (asprintf(&r, + "%u-%u:%u-%u:%u", + nr, + local.vm.svm_cid, local.vm.svm_port, + remote.vm.svm_cid, remote.vm.svm_port) < 0) + return -ENOMEM; + + break; + + default: + assert_not_reached("Unhandled socket type."); + } + + *instance = r; + return 0; +} + +static void socket_close_fds(Socket *s) { + SocketPort *p; + char **i; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + bool was_open; + + was_open = p->fd >= 0; + + p->event_source = sd_event_source_unref(p->event_source); + p->fd = safe_close(p->fd); + socket_cleanup_fd_list(p); + + /* One little note: we should normally not delete any sockets in the file system here! After all some + * other process we spawned might still have a reference of this fd and wants to continue to use + * it. Therefore we normally delete sockets in the file system before we create a new one, not after we + * stopped using one! That all said, if the user explicitly requested this, we'll delete them here + * anyway, but only then. */ + + if (!was_open || !s->remove_on_stop) + continue; + + switch (p->type) { + + case SOCKET_FIFO: + (void) unlink(p->path); + break; + + case SOCKET_MQUEUE: + (void) mq_unlink(p->path); + break; + + case SOCKET_SOCKET: + (void) socket_address_unlink(&p->address); + break; + + default: + break; + } + } + + if (s->remove_on_stop) + STRV_FOREACH(i, s->symlinks) + (void) unlink(*i); +} + +static void socket_apply_socket_options(Socket *s, int fd) { + int r; + + assert(s); + assert(fd >= 0); + + if (s->keep_alive) { + r = setsockopt_int(fd, SOL_SOCKET, SO_KEEPALIVE, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_KEEPALIVE failed: %m"); + } + + if (s->keep_alive_time > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_KEEPIDLE, s->keep_alive_time / USEC_PER_SEC); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_KEEPIDLE failed: %m"); + } + + if (s->keep_alive_interval > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_KEEPINTVL, s->keep_alive_interval / USEC_PER_SEC); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_KEEPINTVL failed: %m"); + } + + if (s->keep_alive_cnt > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_KEEPCNT, s->keep_alive_cnt); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_KEEPCNT failed: %m"); + } + + if (s->defer_accept > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_DEFER_ACCEPT, s->defer_accept / USEC_PER_SEC); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_DEFER_ACCEPT failed: %m"); + } + + if (s->no_delay) { + if (s->socket_protocol == IPPROTO_SCTP) { + r = setsockopt_int(fd, SOL_SCTP, SCTP_NODELAY, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SCTP_NODELAY failed: %m"); + } else { + r = setsockopt_int(fd, SOL_TCP, TCP_NODELAY, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_NODELAY failed: %m"); + } + } + + if (s->broadcast) { + r = setsockopt_int(fd, SOL_SOCKET, SO_BROADCAST, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_BROADCAST failed: %m"); + } + + if (s->pass_cred) { + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_PASSCRED failed: %m"); + } + + if (s->pass_sec) { + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSSEC, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_PASSSEC failed: %m"); + } + + if (s->priority >= 0) { + r = setsockopt_int(fd, SOL_SOCKET, SO_PRIORITY, s->priority); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_PRIORITY failed: %m"); + } + + if (s->receive_buffer > 0) { + /* We first try with SO_RCVBUFFORCE, in case we have the perms for that */ + if (setsockopt_int(fd, SOL_SOCKET, SO_RCVBUFFORCE, s->receive_buffer) < 0) { + r = setsockopt_int(fd, SOL_SOCKET, SO_RCVBUF, s->receive_buffer); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_RCVBUF failed: %m"); + } + } + + if (s->send_buffer > 0) { + if (setsockopt_int(fd, SOL_SOCKET, SO_SNDBUFFORCE, s->send_buffer) < 0) { + r = setsockopt_int(fd, SOL_SOCKET, SO_SNDBUF, s->send_buffer); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_SNDBUF failed: %m"); + } + } + + if (s->mark >= 0) { + r = setsockopt_int(fd, SOL_SOCKET, SO_MARK, s->mark); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_MARK failed: %m"); + } + + if (s->ip_tos >= 0) { + r = setsockopt_int(fd, IPPROTO_IP, IP_TOS, s->ip_tos); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "IP_TOS failed: %m"); + } + + if (s->ip_ttl >= 0) { + int x; + + r = setsockopt_int(fd, IPPROTO_IP, IP_TTL, s->ip_ttl); + + if (socket_ipv6_is_supported()) + x = setsockopt_int(fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, s->ip_ttl); + else + x = -EAFNOSUPPORT; + + if (r < 0 && x < 0) + log_unit_warning_errno(UNIT(s), r, "IP_TTL/IPV6_UNICAST_HOPS failed: %m"); + } + + if (s->tcp_congestion) + if (setsockopt(fd, SOL_TCP, TCP_CONGESTION, s->tcp_congestion, strlen(s->tcp_congestion)+1) < 0) + log_unit_warning_errno(UNIT(s), errno, "TCP_CONGESTION failed: %m"); + + if (s->smack_ip_in) { + r = mac_smack_apply_fd(fd, SMACK_ATTR_IPIN, s->smack_ip_in); + if (r < 0) + log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_in_fd: %m"); + } + + if (s->smack_ip_out) { + r = mac_smack_apply_fd(fd, SMACK_ATTR_IPOUT, s->smack_ip_out); + if (r < 0) + log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_out_fd: %m"); + } +} + +static void socket_apply_fifo_options(Socket *s, int fd) { + int r; + + assert(s); + assert(fd >= 0); + + if (s->pipe_size > 0) + if (fcntl(fd, F_SETPIPE_SZ, s->pipe_size) < 0) + log_unit_warning_errno(UNIT(s), errno, "Setting pipe size failed, ignoring: %m"); + + if (s->smack) { + r = mac_smack_apply_fd(fd, SMACK_ATTR_ACCESS, s->smack); + if (r < 0) + log_unit_error_errno(UNIT(s), r, "SMACK relabelling failed, ignoring: %m"); + } +} + +static int fifo_address_create( + const char *path, + mode_t directory_mode, + mode_t socket_mode) { + + _cleanup_close_ int fd = -1; + mode_t old_mask; + struct stat st; + int r; + + assert(path); + + (void) mkdir_parents_label(path, directory_mode); + + r = mac_selinux_create_file_prepare(path, S_IFIFO); + if (r < 0) + return r; + + /* Enforce the right access mode for the fifo */ + old_mask = umask(~socket_mode); + + /* Include the original umask in our mask */ + (void) umask(~socket_mode | old_mask); + + r = mkfifo(path, socket_mode); + (void) umask(old_mask); + + if (r < 0 && errno != EEXIST) { + r = -errno; + goto fail; + } + + fd = open(path, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK | O_NOFOLLOW); + if (fd < 0) { + r = -errno; + goto fail; + } + + mac_selinux_create_file_clear(); + + if (fstat(fd, &st) < 0) { + r = -errno; + goto fail; + } + + if (!S_ISFIFO(st.st_mode) || + (st.st_mode & 0777) != (socket_mode & ~old_mask) || + st.st_uid != getuid() || + st.st_gid != getgid()) { + r = -EEXIST; + goto fail; + } + + return TAKE_FD(fd); + +fail: + mac_selinux_create_file_clear(); + return r; +} + +static int special_address_create(const char *path, bool writable) { + _cleanup_close_ int fd = -1; + struct stat st; + + assert(path); + + fd = open(path, (writable ? O_RDWR : O_RDONLY)|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + /* Check whether this is a /proc, /sys or /dev file or char device */ + if (!S_ISREG(st.st_mode) && !S_ISCHR(st.st_mode)) + return -EEXIST; + + return TAKE_FD(fd); +} + +static int usbffs_address_create(const char *path) { + _cleanup_close_ int fd = -1; + struct stat st; + + assert(path); + + fd = open(path, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + /* Check whether this is a regular file (ffs endpoint) */ + if (!S_ISREG(st.st_mode)) + return -EEXIST; + + return TAKE_FD(fd); +} + +static int mq_address_create( + const char *path, + mode_t mq_mode, + long maxmsg, + long msgsize) { + + _cleanup_close_ int fd = -1; + struct stat st; + mode_t old_mask; + struct mq_attr _attr, *attr = NULL; + + assert(path); + + if (maxmsg > 0 && msgsize > 0) { + _attr = (struct mq_attr) { + .mq_flags = O_NONBLOCK, + .mq_maxmsg = maxmsg, + .mq_msgsize = msgsize, + }; + attr = &_attr; + } + + /* Enforce the right access mode for the mq */ + old_mask = umask(~mq_mode); + + /* Include the original umask in our mask */ + (void) umask(~mq_mode | old_mask); + fd = mq_open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_CREAT, mq_mode, attr); + (void) umask(old_mask); + + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + if ((st.st_mode & 0777) != (mq_mode & ~old_mask) || + st.st_uid != getuid() || + st.st_gid != getgid()) + return -EEXIST; + + return TAKE_FD(fd); +} + +static int socket_symlink(Socket *s) { + const char *p; + char **i; + int r; + + assert(s); + + p = socket_find_symlink_target(s); + if (!p) + return 0; + + STRV_FOREACH(i, s->symlinks) { + (void) mkdir_parents_label(*i, s->directory_mode); + + r = symlink_idempotent(p, *i, false); + + if (r == -EEXIST && s->remove_on_stop) { + /* If there's already something where we want to create the symlink, and the destructive + * RemoveOnStop= mode is set, then we might as well try to remove what already exists and try + * again. */ + + if (unlink(*i) >= 0) + r = symlink_idempotent(p, *i, false); + } + + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to create symlink %s → %s, ignoring: %m", p, *i); + } + + return 0; +} + +static int usbffs_write_descs(int fd, Service *s) { + int r; + + if (!s->usb_function_descriptors || !s->usb_function_strings) + return -EINVAL; + + r = copy_file_fd(s->usb_function_descriptors, fd, 0); + if (r < 0) + return r; + + return copy_file_fd(s->usb_function_strings, fd, 0); +} + +static int usbffs_select_ep(const struct dirent *d) { + return d->d_name[0] != '.' && !streq(d->d_name, "ep0"); +} + +static int usbffs_dispatch_eps(SocketPort *p) { + _cleanup_free_ struct dirent **ent = NULL; + size_t n, k, i; + int r; + + r = scandir(p->path, &ent, usbffs_select_ep, alphasort); + if (r < 0) + return -errno; + + n = (size_t) r; + p->auxiliary_fds = new(int, n); + if (!p->auxiliary_fds) { + r = -ENOMEM; + goto clear; + } + + p->n_auxiliary_fds = n; + + k = 0; + for (i = 0; i < n; ++i) { + _cleanup_free_ char *ep = NULL; + + ep = path_make_absolute(ent[i]->d_name, p->path); + if (!ep) { + r = -ENOMEM; + goto fail; + } + + path_simplify(ep, false); + + r = usbffs_address_create(ep); + if (r < 0) + goto fail; + + p->auxiliary_fds[k++] = r; + } + + r = 0; + goto clear; + +fail: + close_many(p->auxiliary_fds, k); + p->auxiliary_fds = mfree(p->auxiliary_fds); + p->n_auxiliary_fds = 0; + +clear: + for (i = 0; i < n; ++i) + free(ent[i]); + + return r; +} + +static int socket_determine_selinux_label(Socket *s, char **ret) { + Service *service; + ExecCommand *c; + _cleanup_free_ char *path = NULL; + int r; + + assert(s); + assert(ret); + + if (s->selinux_context_from_net) { + /* If this is requested, get label from the network label */ + + r = mac_selinux_get_our_label(ret); + if (r == -EOPNOTSUPP) + goto no_label; + + } else { + /* Otherwise, get it from the executable we are about to start */ + r = socket_instantiate_service(s); + if (r < 0) + return r; + + if (!UNIT_ISSET(s->service)) + goto no_label; + + service = SERVICE(UNIT_DEREF(s->service)); + c = service->exec_command[SERVICE_EXEC_START]; + if (!c) + goto no_label; + + r = chase_symlinks(c->path, service->exec_context.root_directory, CHASE_PREFIX_ROOT, &path); + if (r < 0) + goto no_label; + + r = mac_selinux_get_create_label_from_exe(path, ret); + if (IN_SET(r, -EPERM, -EOPNOTSUPP)) + goto no_label; + } + + return r; + +no_label: + *ret = NULL; + return 0; +} + +static int socket_address_listen_do( + Socket *s, + const SocketAddress *address, + const char *label) { + + assert(s); + assert(address); + + return socket_address_listen( + address, + SOCK_CLOEXEC|SOCK_NONBLOCK, + s->backlog, + s->bind_ipv6_only, + s->bind_to_device, + s->reuse_port, + s->free_bind, + s->transparent, + s->directory_mode, + s->socket_mode, + label); +} + +#define log_address_error_errno(u, address, error, fmt) \ + ({ \ + _cleanup_free_ char *_t = NULL; \ + \ + (void) socket_address_print(address, &_t); \ + log_unit_error_errno(u, error, fmt, strna(_t)); \ + }) + +static int socket_address_listen_in_cgroup( + Socket *s, + const SocketAddress *address, + const char *label) { + + _cleanup_close_pair_ int pair[2] = { -1, -1 }; + int fd, r; + pid_t pid; + + assert(s); + assert(address); + + /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the socket's cgroup + * in which the socket is actually created. This way we ensure the socket is actually properly attached to the + * unit's cgroup for the purpose of BPF filtering and such. */ + + if (!IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) + goto shortcut; /* BPF filtering only applies to IPv4 + IPv6, shortcut things for other protocols */ + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */ + goto shortcut; + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); + + r = unit_fork_helper_process(UNIT(s), "(sd-listen)", &pid); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to fork off listener stub process: %m"); + if (r == 0) { + /* Child */ + + pair[0] = safe_close(pair[0]); + + fd = socket_address_listen_do(s, address, label); + if (fd < 0) { + log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); + _exit(EXIT_FAILURE); + } + + r = send_one_fd(pair[1], fd, 0); + if (r < 0) { + log_address_error_errno(UNIT(s), address, r, "Failed to send listening socket (%s) to parent: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + fd = receive_one_fd(pair[0], 0); + + /* We synchronously wait for the helper, as it shouldn't be slow */ + r = wait_for_terminate_and_check("(sd-listen)", pid, WAIT_LOG_ABNORMAL); + if (r < 0) { + safe_close(fd); + return r; + } + + if (fd < 0) + return log_address_error_errno(UNIT(s), address, fd, "Failed to receive listening socket (%s): %m"); + + return fd; + +shortcut: + fd = socket_address_listen_do(s, address, label); + if (fd < 0) + return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); + + return fd; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Socket *, socket_close_fds); + +static int socket_open_fds(Socket *_s) { + _cleanup_(socket_close_fdsp) Socket *s = _s; + _cleanup_(mac_selinux_freep) char *label = NULL; + bool know_label = false; + SocketPort *p; + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + + if (p->fd >= 0) + continue; + + switch (p->type) { + + case SOCKET_SOCKET: + + if (!know_label) { + /* Figure out label, if we don't it know yet. We do it once, for the first socket where + * we need this and remember it for the rest. */ + + r = socket_determine_selinux_label(s, &label); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to determine SELinux label: %m"); + + know_label = true; + } + + /* Apply the socket protocol */ + switch (p->address.type) { + + case SOCK_STREAM: + case SOCK_SEQPACKET: + if (s->socket_protocol == IPPROTO_SCTP) + p->address.protocol = s->socket_protocol; + break; + + case SOCK_DGRAM: + if (s->socket_protocol == IPPROTO_UDPLITE) + p->address.protocol = s->socket_protocol; + break; + } + + p->fd = socket_address_listen_in_cgroup(s, &p->address, label); + if (p->fd < 0) + return p->fd; + + socket_apply_socket_options(s, p->fd); + socket_symlink(s); + break; + + case SOCKET_SPECIAL: + + p->fd = special_address_create(p->path, s->writable); + if (p->fd < 0) + return log_unit_error_errno(UNIT(s), p->fd, "Failed to open special file %s: %m", p->path); + break; + + case SOCKET_FIFO: + + p->fd = fifo_address_create( + p->path, + s->directory_mode, + s->socket_mode); + if (p->fd < 0) + return log_unit_error_errno(UNIT(s), p->fd, "Failed to open FIFO %s: %m", p->path); + + socket_apply_fifo_options(s, p->fd); + socket_symlink(s); + break; + + case SOCKET_MQUEUE: + + p->fd = mq_address_create( + p->path, + s->socket_mode, + s->mq_maxmsg, + s->mq_msgsize); + if (p->fd < 0) + return log_unit_error_errno(UNIT(s), p->fd, "Failed to open message queue %s: %m", p->path); + break; + + case SOCKET_USB_FUNCTION: { + _cleanup_free_ char *ep = NULL; + + ep = path_make_absolute("ep0", p->path); + + p->fd = usbffs_address_create(ep); + if (p->fd < 0) + return p->fd; + + r = usbffs_write_descs(p->fd, SERVICE(UNIT_DEREF(s->service))); + if (r < 0) + return r; + + r = usbffs_dispatch_eps(p); + if (r < 0) + return r; + + break; + } + default: + assert_not_reached("Unknown port type"); + } + } + + s = NULL; + return 0; +} + +static void socket_unwatch_fds(Socket *s) { + SocketPort *p; + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + continue; + + if (!p->event_source) + continue; + + r = sd_event_source_set_enabled(p->event_source, SD_EVENT_OFF); + if (r < 0) + log_unit_debug_errno(UNIT(s), r, "Failed to disable event source: %m"); + } +} + +static int socket_watch_fds(Socket *s) { + SocketPort *p; + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + continue; + + if (p->event_source) { + r = sd_event_source_set_enabled(p->event_source, SD_EVENT_ON); + if (r < 0) + goto fail; + } else { + r = sd_event_add_io(UNIT(s)->manager->event, &p->event_source, p->fd, EPOLLIN, socket_dispatch_io, p); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(p->event_source, "socket-port-io"); + } + } + + return 0; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to watch listening fds: %m"); + socket_unwatch_fds(s); + return r; +} + +enum { + SOCKET_OPEN_NONE, + SOCKET_OPEN_SOME, + SOCKET_OPEN_ALL, +}; + +static int socket_check_open(Socket *s) { + bool have_open = false, have_closed = false; + SocketPort *p; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + have_closed = true; + else + have_open = true; + + if (have_open && have_closed) + return SOCKET_OPEN_SOME; + } + + if (have_open) + return SOCKET_OPEN_ALL; + + return SOCKET_OPEN_NONE; +} + +static void socket_set_state(Socket *s, SocketState state) { + SocketState old_state; + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + old_state = s->state; + s->state = state; + + if (!IN_SET(state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL)) { + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + socket_unwatch_control_pid(s); + s->control_command = NULL; + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + } + + if (state != SOCKET_LISTENING) + socket_unwatch_fds(s); + + if (!IN_SET(state, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_LISTENING, + SOCKET_RUNNING, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL)) + socket_close_fds(s); + + if (state != old_state) + log_unit_debug(UNIT(s), "Changed %s -> %s", socket_state_to_string(old_state), socket_state_to_string(state)); + + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int socket_coldplug(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(s); + assert(s->state == SOCKET_DEAD); + + if (s->deserialized_state == s->state) + return 0; + + if (s->control_pid > 0 && + pid_is_unwaited(s->control_pid) && + IN_SET(s->deserialized_state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL)) { + + r = unit_watch_pid(UNIT(s), s->control_pid); + if (r < 0) + return r; + + r = socket_arm_timer(s, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec)); + if (r < 0) + return r; + } + + if (IN_SET(s->deserialized_state, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_LISTENING, + SOCKET_RUNNING)) { + + /* Originally, we used to simply reopen all sockets here that we didn't have file descriptors + * for. However, this is problematic, as we won't traverse throught the SOCKET_START_CHOWN state for + * them, and thus the UID/GID wouldn't be right. Hence, instead simply check if we have all fds open, + * and if there's a mismatch, warn loudly. */ + + r = socket_check_open(s); + if (r == SOCKET_OPEN_NONE) + log_unit_warning(UNIT(s), + "Socket unit configuration has changed while unit has been running, " + "no open socket file descriptor left. " + "The socket unit is not functional until restarted."); + else if (r == SOCKET_OPEN_SOME) + log_unit_warning(UNIT(s), + "Socket unit configuration has changed while unit has been running, " + "and some socket file descriptors have not been opened yet. " + "The socket unit is not fully functional until restarted."); + } + + if (s->deserialized_state == SOCKET_LISTENING) { + r = socket_watch_fds(s); + if (r < 0) + return r; + } + + if (!IN_SET(s->deserialized_state, SOCKET_DEAD, SOCKET_FAILED)) { + (void) unit_setup_dynamic_creds(u); + (void) unit_setup_exec_runtime(u); + } + + socket_set_state(s, s->deserialized_state); + return 0; +} + +static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { + + _cleanup_(exec_params_clear) ExecParameters exec_params = { + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, + }; + pid_t pid; + int r; + + assert(s); + assert(c); + assert(_pid); + + r = unit_prepare_exec(UNIT(s)); + if (r < 0) + return r; + + r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + return r; + + r = unit_set_exec_params(UNIT(s), &exec_params); + if (r < 0) + return r; + + r = exec_spawn(UNIT(s), + c, + &s->exec_context, + &exec_params, + s->exec_runtime, + &s->dynamic_creds, + &pid); + if (r < 0) + return r; + + r = unit_watch_pid(UNIT(s), pid); + if (r < 0) + /* FIXME: we need to do something here */ + return r; + + *_pid = pid; + + return 0; +} + +static int socket_chown(Socket *s, pid_t *_pid) { + pid_t pid; + int r; + + r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + goto fail; + + /* We have to resolve the user names out-of-process, hence + * let's fork here. It's messy, but well, what can we do? */ + + r = unit_fork_helper_process(UNIT(s), "(sd-chown)", &pid); + if (r < 0) + return r; + if (r == 0) { + uid_t uid = UID_INVALID; + gid_t gid = GID_INVALID; + SocketPort *p; + + /* Child */ + + if (!isempty(s->user)) { + const char *user = s->user; + + r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to resolve user %s: %m", user); + _exit(EXIT_USER); + } + } + + if (!isempty(s->group)) { + const char *group = s->group; + + r = get_group_creds(&group, &gid, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to resolve group %s: %m", group); + _exit(EXIT_GROUP); + } + } + + LIST_FOREACH(port, p, s->ports) { + const char *path = NULL; + + if (p->type == SOCKET_SOCKET) + path = socket_address_get_path(&p->address); + else if (p->type == SOCKET_FIFO) + path = p->path; + + if (!path) + continue; + + if (chown(path, uid, gid) < 0) { + log_unit_error_errno(UNIT(s), errno, "Failed to chown(): %m"); + _exit(EXIT_CHOWN); + } + } + + _exit(EXIT_SUCCESS); + } + + r = unit_watch_pid(UNIT(s), pid); + if (r < 0) + goto fail; + + *_pid = pid; + return 0; + +fail: + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + return r; +} + +static void socket_enter_dead(Socket *s, SocketResult f) { + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + if (s->result == SOCKET_SUCCESS) + unit_log_success(UNIT(s)); + else + unit_log_failure(UNIT(s), socket_result_to_string(s->result)); + + socket_set_state(s, s->result != SOCKET_SUCCESS ? SOCKET_FAILED : SOCKET_DEAD); + + s->exec_runtime = exec_runtime_unref(s->exec_runtime, true); + + exec_context_destroy_runtime_directory(&s->exec_context, UNIT(s)->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + + unit_unref_uid_gid(UNIT(s), true); + + dynamic_creds_destroy(&s->dynamic_creds); +} + +static void socket_enter_signal(Socket *s, SocketState state, SocketResult f); + +static void socket_enter_stop_post(Socket *s, SocketResult f) { + int r; + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_STOP_POST; + s->control_command = s->exec_command[SOCKET_EXEC_STOP_POST]; + + if (s->control_command) { + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + socket_set_state(s, SOCKET_STOP_POST); + } else + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop-post' task: %m"); + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) { + int r; + + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + r = unit_kill_context( + UNIT(s), + &s->kill_context, + !IN_SET(state, SOCKET_STOP_PRE_SIGTERM, SOCKET_FINAL_SIGTERM) ? + KILL_KILL : KILL_TERMINATE, + -1, + s->control_pid, + false); + if (r < 0) + goto fail; + + if (r > 0) { + r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + goto fail; + + socket_set_state(s, state); + } else if (state == SOCKET_STOP_PRE_SIGTERM) + socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_SUCCESS); + else if (state == SOCKET_STOP_PRE_SIGKILL) + socket_enter_stop_post(s, SOCKET_SUCCESS); + else if (state == SOCKET_FINAL_SIGTERM) + socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS); + else + socket_enter_dead(s, SOCKET_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + + if (IN_SET(state, SOCKET_STOP_PRE_SIGTERM, SOCKET_STOP_PRE_SIGKILL)) + socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES); + else + socket_enter_dead(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_stop_pre(Socket *s, SocketResult f) { + int r; + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_STOP_PRE; + s->control_command = s->exec_command[SOCKET_EXEC_STOP_PRE]; + + if (s->control_command) { + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + socket_set_state(s, SOCKET_STOP_PRE); + } else + socket_enter_stop_post(s, SOCKET_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop-pre' task: %m"); + socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_listening(Socket *s) { + int r; + assert(s); + + r = socket_watch_fds(s); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to watch sockets: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_LISTENING); + return; + +fail: + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_start_post(Socket *s) { + int r; + assert(s); + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_START_POST; + s->control_command = s->exec_command[SOCKET_EXEC_START_POST]; + + if (s->control_command) { + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-post' task: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_START_POST); + } else + socket_enter_listening(s); + + return; + +fail: + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_start_chown(Socket *s) { + int r; + + assert(s); + + r = socket_open_fds(s); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to listen on sockets: %m"); + goto fail; + } + + if (!isempty(s->user) || !isempty(s->group)) { + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_START_CHOWN; + s->control_command = NULL; + + r = socket_chown(s, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to fork 'start-chown' task: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_START_CHOWN); + } else + socket_enter_start_post(s); + + return; + +fail: + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_start_pre(Socket *s) { + int r; + assert(s); + + socket_unwatch_control_pid(s); + + unit_warn_leftover_processes(UNIT(s)); + + s->control_command_id = SOCKET_EXEC_START_PRE; + s->control_command = s->exec_command[SOCKET_EXEC_START_PRE]; + + if (s->control_command) { + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-pre' task: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_START_PRE); + } else + socket_enter_start_chown(s); + + return; + +fail: + socket_enter_dead(s, SOCKET_FAILURE_RESOURCES); +} + +static void flush_ports(Socket *s) { + SocketPort *p; + + /* Flush all incoming traffic, regardless if actual bytes or new connections, so that this socket isn't busy + * anymore */ + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + continue; + + (void) flush_accept(p->fd); + (void) flush_fd(p->fd); + } +} + +static void socket_enter_running(Socket *s, int cfd) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + /* Note that this call takes possession of the connection fd passed. It either has to assign it somewhere or + * close it. */ + + assert(s); + + /* We don't take connections anymore if we are supposed to shut down anyway */ + if (unit_stop_pending(UNIT(s))) { + + log_unit_debug(UNIT(s), "Suppressing connection request since unit stop is scheduled."); + + if (cfd >= 0) + goto refuse; + else + flush_ports(s); + + return; + } + + if (!ratelimit_below(&s->trigger_limit)) { + log_unit_warning(UNIT(s), "Trigger limit hit, refusing further activation."); + socket_enter_stop_pre(s, SOCKET_FAILURE_TRIGGER_LIMIT_HIT); + goto refuse; + } + + if (cfd < 0) { + bool pending = false; + Unit *other; + Iterator i; + void *v; + + /* If there's already a start pending don't bother to + * do anything */ + HASHMAP_FOREACH_KEY(v, other, UNIT(s)->dependencies[UNIT_TRIGGERS], i) + if (unit_active_or_pending(other)) { + pending = true; + break; + } + + if (!pending) { + if (!UNIT_ISSET(s->service)) { + log_unit_error(UNIT(s), "Service to activate vanished, refusing activation."); + r = -ENOENT; + goto fail; + } + + r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT_DEREF(s->service), JOB_REPLACE, &error, NULL); + if (r < 0) + goto fail; + } + + socket_set_state(s, SOCKET_RUNNING); + } else { + _cleanup_free_ char *prefix = NULL, *instance = NULL, *name = NULL; + _cleanup_(socket_peer_unrefp) SocketPeer *p = NULL; + Service *service; + + if (s->n_connections >= s->max_connections) { + log_unit_warning(UNIT(s), "Too many incoming connections (%u), dropping connection.", + s->n_connections); + goto refuse; + } + + if (s->max_connections_per_source > 0) { + r = socket_acquire_peer(s, cfd, &p); + if (r < 0) { + goto refuse; + } else if (r > 0 && p->n_ref > s->max_connections_per_source) { + _cleanup_free_ char *t = NULL; + + (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, true, false, &t); + + log_unit_warning(UNIT(s), + "Too many incoming connections (%u) from source %s, dropping connection.", + p->n_ref, strnull(t)); + goto refuse; + } + } + + r = socket_instantiate_service(s); + if (r < 0) + goto fail; + + r = instance_from_socket(cfd, s->n_accepted, &instance); + if (r < 0) { + if (r != -ENOTCONN) + goto fail; + + /* ENOTCONN is legitimate if TCP RST was received. + * This connection is over, but the socket unit lives on. */ + log_unit_debug(UNIT(s), "Got ENOTCONN on incoming socket, assuming aborted connection attempt, ignoring."); + goto refuse; + } + + r = unit_name_to_prefix(UNIT(s)->id, &prefix); + if (r < 0) + goto fail; + + r = unit_name_build(prefix, instance, ".service", &name); + if (r < 0) + goto fail; + + r = unit_add_name(UNIT_DEREF(s->service), name); + if (r < 0) + goto fail; + + service = SERVICE(UNIT_DEREF(s->service)); + unit_ref_unset(&s->service); + + s->n_accepted++; + unit_choose_id(UNIT(service), name); + + r = service_set_socket_fd(service, cfd, s, s->selinux_context_from_net); + if (r < 0) + goto fail; + + cfd = -1; /* We passed ownership of the fd to the service now. Forget it here. */ + s->n_connections++; + + service->peer = TAKE_PTR(p); /* Pass ownership of the peer reference */ + + r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT(service), JOB_REPLACE, &error, NULL); + if (r < 0) { + /* We failed to activate the new service, but it still exists. Let's make sure the service + * closes and forgets the connection fd again, immediately. */ + service_close_socket_fd(service); + goto fail; + } + + /* Notify clients about changed counters */ + unit_add_to_dbus_queue(UNIT(s)); + } + + return; + +refuse: + s->n_refused++; + safe_close(cfd); + return; + +fail: + log_unit_warning(UNIT(s), "Failed to queue service startup job (Maybe the service file is missing or not a %s unit?): %s", + cfd >= 0 ? "template" : "non-template", + bus_error_message(&error, r)); + + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); + safe_close(cfd); +} + +static void socket_run_next(Socket *s) { + int r; + + assert(s); + assert(s->control_command); + assert(s->control_command->command_next); + + socket_unwatch_control_pid(s); + + s->control_command = s->control_command->command_next; + + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run next task: %m"); + + if (s->state == SOCKET_START_POST) + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); + else if (s->state == SOCKET_STOP_POST) + socket_enter_dead(s, SOCKET_FAILURE_RESOURCES); + else + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES); +} + +static int socket_start(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(s); + + /* We cannot fulfill this request right now, try again later + * please! */ + if (IN_SET(s->state, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL)) + return -EAGAIN; + + /* Already on it! */ + if (IN_SET(s->state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST)) + return 0; + + /* Cannot run this without the service being around */ + if (UNIT_ISSET(s->service)) { + Service *service; + + service = SERVICE(UNIT_DEREF(s->service)); + + if (UNIT(service)->load_state != UNIT_LOADED) { + log_unit_error(u, "Socket service %s not loaded, refusing.", UNIT(service)->id); + return -ENOENT; + } + + /* If the service is already active we cannot start the + * socket */ + if (!IN_SET(service->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) { + log_unit_error(u, "Socket service %s already active, refusing.", UNIT(service)->id); + return -EBUSY; + } + } + + assert(IN_SET(s->state, SOCKET_DEAD, SOCKET_FAILED)); + + r = unit_start_limit_test(u); + if (r < 0) { + socket_enter_dead(s, SOCKET_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + s->result = SOCKET_SUCCESS; + exec_command_reset_status_list_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX); + + u->reset_accounting = true; + + socket_enter_start_pre(s); + return 1; +} + +static int socket_stop(Unit *u) { + Socket *s = SOCKET(u); + + assert(s); + + /* Already on it */ + if (IN_SET(s->state, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL)) + return 0; + + /* If there's already something running we go directly into + * kill mode. */ + if (IN_SET(s->state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST)) { + socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_SUCCESS); + return -EAGAIN; + } + + assert(IN_SET(s->state, SOCKET_LISTENING, SOCKET_RUNNING)); + + socket_enter_stop_pre(s, SOCKET_SUCCESS); + return 1; +} + +static int socket_serialize(Unit *u, FILE *f, FDSet *fds) { + Socket *s = SOCKET(u); + SocketPort *p; + int r; + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", socket_state_to_string(s->state)); + (void) serialize_item(f, "result", socket_result_to_string(s->result)); + (void) serialize_item_format(f, "n-accepted", "%u", s->n_accepted); + (void) serialize_item_format(f, "n-refused", "%u", s->n_refused); + + if (s->control_pid > 0) + (void) serialize_item_format(f, "control-pid", PID_FMT, s->control_pid); + + if (s->control_command_id >= 0) + (void) serialize_item(f, "control-command", socket_exec_command_to_string(s->control_command_id)); + + LIST_FOREACH(port, p, s->ports) { + int copy; + + if (p->fd < 0) + continue; + + copy = fdset_put_dup(fds, p->fd); + if (copy < 0) + return log_unit_warning_errno(u, copy, "Failed to serialize socket fd: %m"); + + if (p->type == SOCKET_SOCKET) { + _cleanup_free_ char *t = NULL; + + r = socket_address_print(&p->address, &t); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to format socket address: %m"); + + if (socket_address_family(&p->address) == AF_NETLINK) + (void) serialize_item_format(f, "netlink", "%i %s", copy, t); + else + (void) serialize_item_format(f, "socket", "%i %i %s", copy, p->address.type, t); + } else if (p->type == SOCKET_SPECIAL) + (void) serialize_item_format(f, "special", "%i %s", copy, p->path); + else if (p->type == SOCKET_MQUEUE) + (void) serialize_item_format(f, "mqueue", "%i %s", copy, p->path); + else if (p->type == SOCKET_USB_FUNCTION) + (void) serialize_item_format(f, "ffs", "%i %s", copy, p->path); + else { + assert(p->type == SOCKET_FIFO); + (void) serialize_item_format(f, "fifo", "%i %s", copy, p->path); + } + } + + return 0; +} + +static void socket_port_take_fd(SocketPort *p, FDSet *fds, int fd) { + assert(p); + + safe_close(p->fd); + p->fd = fdset_remove(fds, fd); +} + +static int socket_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Socket *s = SOCKET(u); + + assert(u); + assert(key); + assert(value); + + if (streq(key, "state")) { + SocketState state; + + state = socket_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + } else if (streq(key, "result")) { + SocketResult f; + + f = socket_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != SOCKET_SUCCESS) + s->result = f; + + } else if (streq(key, "n-accepted")) { + unsigned k; + + if (safe_atou(value, &k) < 0) + log_unit_debug(u, "Failed to parse n-accepted value: %s", value); + else + s->n_accepted += k; + } else if (streq(key, "n-refused")) { + unsigned k; + + if (safe_atou(value, &k) < 0) + log_unit_debug(u, "Failed to parse n-refused value: %s", value); + else + s->n_refused += k; + } else if (streq(key, "control-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse control-pid value: %s", value); + else + s->control_pid = pid; + } else if (streq(key, "control-command")) { + SocketExecCommand id; + + id = socket_exec_command_from_string(value); + if (id < 0) + log_unit_debug(u, "Failed to parse exec-command value: %s", value); + else { + s->control_command_id = id; + s->control_command = s->exec_command[id]; + } + } else if (streq(key, "fifo")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse fifo value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (p->type == SOCKET_FIFO && + path_equal_or_files_same(p->path, value+skip, 0)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else if (streq(key, "special")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse special value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (p->type == SOCKET_SPECIAL && + path_equal_or_files_same(p->path, value+skip, 0)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else if (streq(key, "mqueue")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse mqueue value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (p->type == SOCKET_MQUEUE && + streq(p->path, value+skip)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else if (streq(key, "socket")) { + int fd, type, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %i %n", &fd, &type, &skip) < 2 || fd < 0 || type < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse socket value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (socket_address_is(&p->address, value+skip, type)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else if (streq(key, "netlink")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse socket value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (socket_address_is_netlink(&p->address, value+skip)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else if (streq(key, "ffs")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse ffs value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (p->type == SOCKET_USB_FUNCTION && + path_equal_or_files_same(p->path, value+skip, 0)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else + log_unit_debug(UNIT(s), "Unknown serialization key: %s", key); + + return 0; +} + +static void socket_distribute_fds(Unit *u, FDSet *fds) { + Socket *s = SOCKET(u); + SocketPort *p; + + assert(u); + + LIST_FOREACH(port, p, s->ports) { + Iterator i; + int fd; + + if (p->type != SOCKET_SOCKET) + continue; + + if (p->fd >= 0) + continue; + + FDSET_FOREACH(fd, fds, i) { + if (socket_address_matches_fd(&p->address, fd)) { + p->fd = fdset_remove(fds, fd); + s->deserialized_state = SOCKET_LISTENING; + break; + } + } + } +} + +_pure_ static UnitActiveState socket_active_state(Unit *u) { + assert(u); + + return state_translation_table[SOCKET(u)->state]; +} + +_pure_ static const char *socket_sub_state_to_string(Unit *u) { + assert(u); + + return socket_state_to_string(SOCKET(u)->state); +} + +const char* socket_port_type_to_string(SocketPort *p) { + + assert(p); + + switch (p->type) { + + case SOCKET_SOCKET: + + switch (p->address.type) { + + case SOCK_STREAM: + return "Stream"; + + case SOCK_DGRAM: + return "Datagram"; + + case SOCK_SEQPACKET: + return "SequentialPacket"; + + case SOCK_RAW: + if (socket_address_family(&p->address) == AF_NETLINK) + return "Netlink"; + + _fallthrough_; + default: + return NULL; + } + + case SOCKET_SPECIAL: + return "Special"; + + case SOCKET_MQUEUE: + return "MessageQueue"; + + case SOCKET_FIFO: + return "FIFO"; + + case SOCKET_USB_FUNCTION: + return "USBFunction"; + + default: + return NULL; + } +} + +SocketType socket_port_type_from_string(const char *s) { + assert(s); + + if (STR_IN_SET(s, "Stream", "Datagram", "SequentialPacket", "Netlink")) + return SOCKET_SOCKET; + else if (streq(s, "Special")) + return SOCKET_SPECIAL; + else if (streq(s, "MessageQueue")) + return SOCKET_MQUEUE; + else if (streq(s, "FIFO")) + return SOCKET_FIFO; + else if (streq(s, "USBFunction")) + return SOCKET_USB_FUNCTION; + else + return _SOCKET_TYPE_INVALID; +} + +_pure_ static bool socket_may_gc(Unit *u) { + Socket *s = SOCKET(u); + + assert(u); + + return s->n_connections == 0; +} + +static int socket_accept_do(Socket *s, int fd) { + int cfd; + + assert(s); + assert(fd >= 0); + + for (;;) { + cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK); + if (cfd < 0) { + if (errno == EINTR) + continue; + + return -errno; + } + + break; + } + + return cfd; +} + +static int socket_accept_in_cgroup(Socket *s, SocketPort *p, int fd) { + _cleanup_close_pair_ int pair[2] = { -1, -1 }; + int cfd, r; + pid_t pid; + + assert(s); + assert(p); + assert(fd >= 0); + + /* Similar to socket_address_listen_in_cgroup(), but for accept() rathern than socket(): make sure that any + * connection socket is also properly associated with the cgroup. */ + + if (!IN_SET(p->address.sockaddr.sa.sa_family, AF_INET, AF_INET6)) + goto shortcut; + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == BPF_FIREWALL_UNSUPPORTED) + goto shortcut; + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); + + r = unit_fork_helper_process(UNIT(s), "(sd-accept)", &pid); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to fork off accept stub process: %m"); + if (r == 0) { + /* Child */ + + pair[0] = safe_close(pair[0]); + + cfd = socket_accept_do(s, fd); + if (cfd < 0) { + log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m"); + _exit(EXIT_FAILURE); + } + + r = send_one_fd(pair[1], cfd, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to send connection socket to parent: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + cfd = receive_one_fd(pair[0], 0); + + /* We synchronously wait for the helper, as it shouldn't be slow */ + r = wait_for_terminate_and_check("(sd-accept)", pid, WAIT_LOG_ABNORMAL); + if (r < 0) { + safe_close(cfd); + return r; + } + + if (cfd < 0) + return log_unit_error_errno(UNIT(s), cfd, "Failed to receive connection socket: %m"); + + return cfd; + +shortcut: + cfd = socket_accept_do(s, fd); + if (cfd < 0) + return log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m"); + + return cfd; +} + +static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + SocketPort *p = userdata; + int cfd = -1; + + assert(p); + assert(fd >= 0); + + if (p->socket->state != SOCKET_LISTENING) + return 0; + + log_unit_debug(UNIT(p->socket), "Incoming traffic"); + + if (revents != EPOLLIN) { + + if (revents & EPOLLHUP) + log_unit_error(UNIT(p->socket), "Got POLLHUP on a listening socket. The service probably invoked shutdown() on it, and should better not do that."); + else + log_unit_error(UNIT(p->socket), "Got unexpected poll event (0x%x) on socket.", revents); + goto fail; + } + + if (p->socket->accept && + p->type == SOCKET_SOCKET && + socket_address_can_accept(&p->address)) { + + cfd = socket_accept_in_cgroup(p->socket, p, fd); + if (cfd < 0) + goto fail; + + socket_apply_socket_options(p->socket, cfd); + } + + socket_enter_running(p->socket, cfd); + return 0; + +fail: + socket_enter_stop_pre(p->socket, SOCKET_FAILURE_RESOURCES); + return 0; +} + +static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Socket *s = SOCKET(u); + SocketResult f; + + assert(s); + assert(pid >= 0); + + if (pid != s->control_pid) + return; + + s->control_pid = 0; + + if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + f = SOCKET_SUCCESS; + else if (code == CLD_EXITED) + f = SOCKET_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = SOCKET_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = SOCKET_FAILURE_CORE_DUMP; + else + assert_not_reached("Unknown sigchld code"); + + if (s->control_command) { + exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status); + + if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SOCKET_SUCCESS; + } + + unit_log_process_exit( + u, f == SOCKET_SUCCESS ? LOG_DEBUG : LOG_NOTICE, + "Control process", + socket_exec_command_to_string(s->control_command_id), + code, status); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + if (s->control_command && + s->control_command->command_next && + f == SOCKET_SUCCESS) { + + log_unit_debug(u, "Running next command for state %s", socket_state_to_string(s->state)); + socket_run_next(s); + } else { + s->control_command = NULL; + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + + /* No further commands for this step, so let's figure + * out what to do next */ + + log_unit_debug(u, "Got final SIGCHLD for state %s", socket_state_to_string(s->state)); + + switch (s->state) { + + case SOCKET_START_PRE: + if (f == SOCKET_SUCCESS) + socket_enter_start_chown(s); + else + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, f); + break; + + case SOCKET_START_CHOWN: + if (f == SOCKET_SUCCESS) + socket_enter_start_post(s); + else + socket_enter_stop_pre(s, f); + break; + + case SOCKET_START_POST: + if (f == SOCKET_SUCCESS) + socket_enter_listening(s); + else + socket_enter_stop_pre(s, f); + break; + + case SOCKET_STOP_PRE: + case SOCKET_STOP_PRE_SIGTERM: + case SOCKET_STOP_PRE_SIGKILL: + socket_enter_stop_post(s, f); + break; + + case SOCKET_STOP_POST: + case SOCKET_FINAL_SIGTERM: + case SOCKET_FINAL_SIGKILL: + socket_enter_dead(s, f); + break; + + default: + assert_not_reached("Uh, control process died at wrong time."); + } + } + + /* Notify clients about changed exit status */ + unit_add_to_dbus_queue(u); +} + +static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Socket *s = SOCKET(userdata); + + assert(s); + assert(s->timer_event_source == source); + + switch (s->state) { + + case SOCKET_START_PRE: + log_unit_warning(UNIT(s), "Starting timed out. Terminating."); + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_START_CHOWN: + case SOCKET_START_POST: + log_unit_warning(UNIT(s), "Starting timed out. Stopping."); + socket_enter_stop_pre(s, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_STOP_PRE: + log_unit_warning(UNIT(s), "Stopping timed out. Terminating."); + socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_STOP_PRE_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out. Killing."); + socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL. Ignoring."); + socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT); + } + break; + + case SOCKET_STOP_PRE_SIGKILL: + log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring."); + socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_STOP_POST: + log_unit_warning(UNIT(s), "Stopping timed out (2). Terminating."); + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_FINAL_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out (2). Killing."); + socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out (2). Skipping SIGKILL. Ignoring."); + socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT); + } + break; + + case SOCKET_FINAL_SIGKILL: + log_unit_warning(UNIT(s), "Still around after SIGKILL (2). Entering failed mode."); + socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +int socket_collect_fds(Socket *s, int **fds) { + size_t k = 0, n = 0; + SocketPort *p; + int *rfds; + + assert(s); + assert(fds); + + /* Called from the service code for requesting our fds */ + + LIST_FOREACH(port, p, s->ports) { + if (p->fd >= 0) + n++; + n += p->n_auxiliary_fds; + } + + if (n <= 0) { + *fds = NULL; + return 0; + } + + rfds = new(int, n); + if (!rfds) + return -ENOMEM; + + LIST_FOREACH(port, p, s->ports) { + size_t i; + + if (p->fd >= 0) + rfds[k++] = p->fd; + for (i = 0; i < p->n_auxiliary_fds; ++i) + rfds[k++] = p->auxiliary_fds[i]; + } + + assert(k == n); + + *fds = rfds; + return (int) n; +} + +static void socket_reset_failed(Unit *u) { + Socket *s = SOCKET(u); + + assert(s); + + if (s->state == SOCKET_FAILED) + socket_set_state(s, SOCKET_DEAD); + + s->result = SOCKET_SUCCESS; +} + +void socket_connection_unref(Socket *s) { + assert(s); + + /* The service is dead. Yay! + * + * This is strictly for one-instance-per-connection + * services. */ + + assert(s->n_connections > 0); + s->n_connections--; + + log_unit_debug(UNIT(s), "One connection closed, %u left.", s->n_connections); +} + +static void socket_trigger_notify(Unit *u, Unit *other) { + Socket *s = SOCKET(u); + + assert(u); + assert(other); + + /* Filter out invocations with bogus state */ + if (other->load_state != UNIT_LOADED || other->type != UNIT_SERVICE) + return; + + /* Don't propagate state changes from the service if we are already down */ + if (!IN_SET(s->state, SOCKET_RUNNING, SOCKET_LISTENING)) + return; + + /* We don't care for the service state if we are in Accept=yes mode */ + if (s->accept) + return; + + /* Propagate start limit hit state */ + if (other->start_limit_hit) { + socket_enter_stop_pre(s, SOCKET_FAILURE_SERVICE_START_LIMIT_HIT); + return; + } + + /* Don't propagate anything if there's still a job queued */ + if (other->job) + return; + + if (IN_SET(SERVICE(other)->state, + SERVICE_DEAD, SERVICE_FAILED, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, + SERVICE_AUTO_RESTART)) + socket_enter_listening(s); + + if (SERVICE(other)->state == SERVICE_RUNNING) + socket_set_state(s, SOCKET_RUNNING); +} + +static int socket_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + return unit_kill_common(u, who, signo, -1, SOCKET(u)->control_pid, error); +} + +static int socket_get_timeout(Unit *u, usec_t *timeout) { + Socket *s = SOCKET(u); + usec_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +char *socket_fdname(Socket *s) { + assert(s); + + /* Returns the name to use for $LISTEN_NAMES. If the user + * didn't specify anything specifically, use the socket unit's + * name as fallback. */ + + return s->fdname ?: UNIT(s)->id; +} + +static int socket_control_pid(Unit *u) { + Socket *s = SOCKET(u); + + assert(s); + + return s->control_pid; +} + +static const char* const socket_exec_command_table[_SOCKET_EXEC_COMMAND_MAX] = { + [SOCKET_EXEC_START_PRE] = "ExecStartPre", + [SOCKET_EXEC_START_CHOWN] = "ExecStartChown", + [SOCKET_EXEC_START_POST] = "ExecStartPost", + [SOCKET_EXEC_STOP_PRE] = "ExecStopPre", + [SOCKET_EXEC_STOP_POST] = "ExecStopPost" +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_exec_command, SocketExecCommand); + +static const char* const socket_result_table[_SOCKET_RESULT_MAX] = { + [SOCKET_SUCCESS] = "success", + [SOCKET_FAILURE_RESOURCES] = "resources", + [SOCKET_FAILURE_TIMEOUT] = "timeout", + [SOCKET_FAILURE_EXIT_CODE] = "exit-code", + [SOCKET_FAILURE_SIGNAL] = "signal", + [SOCKET_FAILURE_CORE_DUMP] = "core-dump", + [SOCKET_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [SOCKET_FAILURE_TRIGGER_LIMIT_HIT] = "trigger-limit-hit", + [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit" +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_result, SocketResult); + +const UnitVTable socket_vtable = { + .object_size = sizeof(Socket), + .exec_context_offset = offsetof(Socket, exec_context), + .cgroup_context_offset = offsetof(Socket, cgroup_context), + .kill_context_offset = offsetof(Socket, kill_context), + .exec_runtime_offset = offsetof(Socket, exec_runtime), + .dynamic_creds_offset = offsetof(Socket, dynamic_creds), + + .sections = + "Unit\0" + "Socket\0" + "Install\0", + .private_section = "Socket", + + .can_transient = true, + + .init = socket_init, + .done = socket_done, + .load = socket_load, + + .coldplug = socket_coldplug, + + .dump = socket_dump, + + .start = socket_start, + .stop = socket_stop, + + .kill = socket_kill, + + .get_timeout = socket_get_timeout, + + .serialize = socket_serialize, + .deserialize_item = socket_deserialize_item, + .distribute_fds = socket_distribute_fds, + + .active_state = socket_active_state, + .sub_state_to_string = socket_sub_state_to_string, + + .may_gc = socket_may_gc, + + .sigchld_event = socket_sigchld_event, + + .trigger_notify = socket_trigger_notify, + + .reset_failed = socket_reset_failed, + + .control_pid = socket_control_pid, + + .bus_vtable = bus_socket_vtable, + .bus_set_property = bus_socket_set_property, + .bus_commit_properties = bus_socket_commit_properties, + + .status_message_formats = { + /*.starting_stopping = { + [0] = "Starting socket %s...", + [1] = "Stopping socket %s...", + },*/ + .finished_start_job = { + [JOB_DONE] = "Listening on %s.", + [JOB_FAILED] = "Failed to listen on %s.", + [JOB_TIMEOUT] = "Timed out starting %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Closed %s.", + [JOB_FAILED] = "Failed stopping %s.", + [JOB_TIMEOUT] = "Timed out stopping %s.", + }, + }, +}; diff --git a/src/core/socket.h b/src/core/socket.h new file mode 100644 index 0000000..c4e25db --- /dev/null +++ b/src/core/socket.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct Socket Socket; +typedef struct SocketPeer SocketPeer; + +#include "mount.h" +#include "service.h" +#include "socket-util.h" +#include "unit.h" + +typedef enum SocketExecCommand { + SOCKET_EXEC_START_PRE, + SOCKET_EXEC_START_CHOWN, + SOCKET_EXEC_START_POST, + SOCKET_EXEC_STOP_PRE, + SOCKET_EXEC_STOP_POST, + _SOCKET_EXEC_COMMAND_MAX, + _SOCKET_EXEC_COMMAND_INVALID = -1 +} SocketExecCommand; + +typedef enum SocketType { + SOCKET_SOCKET, + SOCKET_FIFO, + SOCKET_SPECIAL, + SOCKET_MQUEUE, + SOCKET_USB_FUNCTION, + _SOCKET_TYPE_MAX, + _SOCKET_TYPE_INVALID = -1 +} SocketType; + +typedef enum SocketResult { + SOCKET_SUCCESS, + SOCKET_FAILURE_RESOURCES, + SOCKET_FAILURE_TIMEOUT, + SOCKET_FAILURE_EXIT_CODE, + SOCKET_FAILURE_SIGNAL, + SOCKET_FAILURE_CORE_DUMP, + SOCKET_FAILURE_START_LIMIT_HIT, + SOCKET_FAILURE_TRIGGER_LIMIT_HIT, + SOCKET_FAILURE_SERVICE_START_LIMIT_HIT, + _SOCKET_RESULT_MAX, + _SOCKET_RESULT_INVALID = -1 +} SocketResult; + +typedef struct SocketPort { + Socket *socket; + + SocketType type; + int fd; + int *auxiliary_fds; + size_t n_auxiliary_fds; + + SocketAddress address; + char *path; + sd_event_source *event_source; + + LIST_FIELDS(struct SocketPort, port); +} SocketPort; + +struct Socket { + Unit meta; + + LIST_HEAD(SocketPort, ports); + + Set *peers_by_address; + + unsigned n_accepted; + unsigned n_connections; + unsigned n_refused; + unsigned max_connections; + unsigned max_connections_per_source; + + unsigned backlog; + unsigned keep_alive_cnt; + usec_t timeout_usec; + usec_t keep_alive_time; + usec_t keep_alive_interval; + usec_t defer_accept; + + ExecCommand* exec_command[_SOCKET_EXEC_COMMAND_MAX]; + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ExecRuntime *exec_runtime; + DynamicCreds dynamic_creds; + + /* For Accept=no sockets refers to the one service we'll + * activate. For Accept=yes sockets is either NULL, or filled + * to refer to the next service we spawn. */ + UnitRef service; + + SocketState state, deserialized_state; + + sd_event_source *timer_event_source; + + ExecCommand* control_command; + SocketExecCommand control_command_id; + pid_t control_pid; + + mode_t directory_mode; + mode_t socket_mode; + + SocketResult result; + + char **symlinks; + + bool accept; + bool remove_on_stop; + bool writable; + + int socket_protocol; + + /* Socket options */ + bool keep_alive; + bool no_delay; + bool free_bind; + bool transparent; + bool broadcast; + bool pass_cred; + bool pass_sec; + + /* Only for INET6 sockets: issue IPV6_V6ONLY sockopt */ + SocketAddressBindIPv6Only bind_ipv6_only; + + int priority; + int mark; + size_t receive_buffer; + size_t send_buffer; + int ip_tos; + int ip_ttl; + size_t pipe_size; + char *bind_to_device; + char *tcp_congestion; + bool reuse_port; + long mq_maxmsg; + long mq_msgsize; + + char *smack; + char *smack_ip_in; + char *smack_ip_out; + + bool selinux_context_from_net; + + char *user, *group; + + char *fdname; + + RateLimit trigger_limit; +}; + +SocketPeer *socket_peer_ref(SocketPeer *p); +SocketPeer *socket_peer_unref(SocketPeer *p); +int socket_acquire_peer(Socket *s, int fd, SocketPeer **p); + +DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPeer*, socket_peer_unref); + +/* Called from the service code when collecting fds */ +int socket_collect_fds(Socket *s, int **fds); + +/* Called from the service code when a per-connection service ended */ +void socket_connection_unref(Socket *s); + +void socket_free_ports(Socket *s); + +int socket_instantiate_service(Socket *s); + +char *socket_fdname(Socket *s); + +extern const UnitVTable socket_vtable; + +const char* socket_exec_command_to_string(SocketExecCommand i) _const_; +SocketExecCommand socket_exec_command_from_string(const char *s) _pure_; + +const char* socket_result_to_string(SocketResult i) _const_; +SocketResult socket_result_from_string(const char *s) _pure_; + +const char* socket_port_type_to_string(SocketPort *p) _pure_; +SocketType socket_port_type_from_string(const char *p) _pure_; + +DEFINE_CAST(SOCKET, Socket); diff --git a/src/core/swap.c b/src/core/swap.c new file mode 100644 index 0000000..2d8463b --- /dev/null +++ b/src/core/swap.c @@ -0,0 +1,1555 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <sys/epoll.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "sd-device.h" + +#include "alloc-util.h" +#include "dbus-swap.h" +#include "dbus-unit.h" +#include "device-private.h" +#include "device-util.h" +#include "device.h" +#include "escape.h" +#include "exit-status.h" +#include "fd-util.h" +#include "format-util.h" +#include "fstab-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "swap.h" +#include "unit-name.h" +#include "unit.h" +#include "virt.h" + +static const UnitActiveState state_translation_table[_SWAP_STATE_MAX] = { + [SWAP_DEAD] = UNIT_INACTIVE, + [SWAP_ACTIVATING] = UNIT_ACTIVATING, + [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE, + [SWAP_ACTIVE] = UNIT_ACTIVE, + [SWAP_DEACTIVATING] = UNIT_DEACTIVATING, + [SWAP_DEACTIVATING_SIGTERM] = UNIT_DEACTIVATING, + [SWAP_DEACTIVATING_SIGKILL] = UNIT_DEACTIVATING, + [SWAP_FAILED] = UNIT_FAILED +}; + +static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); + +static bool SWAP_STATE_WITH_PROCESS(SwapState state) { + return IN_SET(state, + SWAP_ACTIVATING, + SWAP_ACTIVATING_DONE, + SWAP_DEACTIVATING, + SWAP_DEACTIVATING_SIGTERM, + SWAP_DEACTIVATING_SIGKILL); +} + +static void swap_unset_proc_swaps(Swap *s) { + assert(s); + + if (!s->from_proc_swaps) + return; + + s->parameters_proc_swaps.what = mfree(s->parameters_proc_swaps.what); + + s->from_proc_swaps = false; +} + +static int swap_set_devnode(Swap *s, const char *devnode) { + Hashmap *swaps; + Swap *first; + int r; + + assert(s); + + r = hashmap_ensure_allocated(&UNIT(s)->manager->swaps_by_devnode, &path_hash_ops); + if (r < 0) + return r; + + swaps = UNIT(s)->manager->swaps_by_devnode; + + if (s->devnode) { + first = hashmap_get(swaps, s->devnode); + + LIST_REMOVE(same_devnode, first, s); + if (first) + hashmap_replace(swaps, first->devnode, first); + else + hashmap_remove(swaps, s->devnode); + + s->devnode = mfree(s->devnode); + } + + if (devnode) { + s->devnode = strdup(devnode); + if (!s->devnode) + return -ENOMEM; + + first = hashmap_get(swaps, s->devnode); + LIST_PREPEND(same_devnode, first, s); + + return hashmap_replace(swaps, first->devnode, first); + } + + return 0; +} + +static void swap_init(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + assert(UNIT(s)->load_state == UNIT_STUB); + + s->timeout_usec = u->manager->default_timeout_start_usec; + + s->exec_context.std_output = u->manager->default_std_output; + s->exec_context.std_error = u->manager->default_std_error; + + s->parameters_proc_swaps.priority = s->parameters_fragment.priority = -1; + + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + + u->ignore_on_isolate = true; +} + +static void swap_unwatch_control_pid(Swap *s) { + assert(s); + + if (s->control_pid <= 0) + return; + + unit_unwatch_pid(UNIT(s), s->control_pid); + s->control_pid = 0; +} + +static void swap_done(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + swap_unset_proc_swaps(s); + swap_set_devnode(s, NULL); + + s->what = mfree(s->what); + s->parameters_fragment.what = mfree(s->parameters_fragment.what); + s->parameters_fragment.options = mfree(s->parameters_fragment.options); + + s->exec_runtime = exec_runtime_unref(s->exec_runtime, false); + exec_command_done_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX); + s->control_command = NULL; + + dynamic_creds_unref(&s->dynamic_creds); + + swap_unwatch_control_pid(s); + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); +} + +static int swap_arm_timer(Swap *s, usec_t usec) { + int r; + + assert(s); + + if (s->timer_event_source) { + r = sd_event_source_set_time(s->timer_event_source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time( + UNIT(s)->manager->event, + &s->timer_event_source, + CLOCK_MONOTONIC, + usec, 0, + swap_dispatch_timer, s); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->timer_event_source, "swap-timer"); + + return 0; +} + +static int swap_add_device_dependencies(Swap *s) { + assert(s); + + if (!s->what) + return 0; + + if (!s->from_fragment) + return 0; + + if (is_device_path(s->what)) + return unit_add_node_dependency(UNIT(s), s->what, MANAGER_IS_SYSTEM(UNIT(s)->manager), UNIT_BINDS_TO, UNIT_DEPENDENCY_FILE); + else + /* File based swap devices need to be ordered after + * systemd-remount-fs.service, since they might need a + * writable file system. */ + return unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, UNIT_DEPENDENCY_FILE); +} + +static int swap_add_default_dependencies(Swap *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + if (!MANAGER_IS_SYSTEM(UNIT(s)->manager)) + return 0; + + if (detect_container() > 0) + return 0; + + /* swap units generated for the swap dev links are missing the + * ordering dep against the swap target. */ + r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SWAP_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int swap_verify(Swap *s) { + _cleanup_free_ char *e = NULL; + int r; + + if (UNIT(s)->load_state != UNIT_LOADED) + return 0; + + r = unit_name_from_path(s->what, ".swap", &e); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to generate unit name from path: %m"); + + if (!unit_has_name(UNIT(s), e)) { + log_unit_error(UNIT(s), "Value of What= and unit name do not match, not loading."); + return -ENOEXEC; + } + + if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP) { + log_unit_error(UNIT(s), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing to load."); + return -ENOEXEC; + } + + return 0; +} + +static int swap_load_devnode(Swap *s) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + struct stat st; + const char *p; + int r; + + assert(s); + + if (stat(s->what, &st) < 0 || !S_ISBLK(st.st_mode)) + return 0; + + r = device_new_from_stat_rdev(&d, &st); + if (r < 0) { + log_unit_full(UNIT(s), r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to allocate device for swap %s: %m", s->what); + return 0; + } + + if (sd_device_get_devname(d, &p) < 0) + return 0; + + return swap_set_devnode(s, p); +} + +static int swap_add_extras(Swap *s) { + int r; + + assert(s); + + if (UNIT(s)->fragment_path) + s->from_fragment = true; + + if (!s->what) { + if (s->parameters_fragment.what) + s->what = strdup(s->parameters_fragment.what); + else if (s->parameters_proc_swaps.what) + s->what = strdup(s->parameters_proc_swaps.what); + else { + r = unit_name_to_path(UNIT(s)->id, &s->what); + if (r < 0) + return r; + } + + if (!s->what) + return -ENOMEM; + } + + path_simplify(s->what, false); + + if (!UNIT(s)->description) { + r = unit_set_description(UNIT(s), s->what); + if (r < 0) + return r; + } + + r = unit_require_mounts_for(UNIT(s), s->what, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + + r = swap_add_device_dependencies(s); + if (r < 0) + return r; + + r = swap_load_devnode(s); + if (r < 0) + return r; + + r = unit_patch_contexts(UNIT(s)); + if (r < 0) + return r; + + r = unit_add_exec_dependencies(UNIT(s), &s->exec_context); + if (r < 0) + return r; + + r = unit_set_default_slice(UNIT(s)); + if (r < 0) + return r; + + r = swap_add_default_dependencies(s); + if (r < 0) + return r; + + return 0; +} + +static int swap_load(Unit *u) { + Swap *s = SWAP(u); + int r, q; + + assert(s); + assert(u->load_state == UNIT_STUB); + + /* Load a .swap file */ + if (SWAP(u)->from_proc_swaps) + r = unit_load_fragment_and_dropin_optional(u); + else + r = unit_load_fragment_and_dropin(u); + + /* Add in some extras, and do so either when we successfully loaded something or when /proc/swaps is already + * active. */ + if (u->load_state == UNIT_LOADED || s->from_proc_swaps) + q = swap_add_extras(s); + else + q = 0; + + if (r < 0) + return r; + if (q < 0) + return q; + + return swap_verify(s); +} + +static int swap_setup_unit( + Manager *m, + const char *what, + const char *what_proc_swaps, + int priority, + bool set_flags) { + + _cleanup_free_ char *e = NULL; + bool delete = false; + Unit *u = NULL; + int r; + SwapParameters *p; + + assert(m); + assert(what); + assert(what_proc_swaps); + + r = unit_name_from_path(what, ".swap", &e); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to generate unit name from path: %m"); + + u = manager_get_unit(m, e); + if (u && + SWAP(u)->from_proc_swaps && + !path_equal(SWAP(u)->parameters_proc_swaps.what, what_proc_swaps)) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "Swap %s appeared twice with different device paths %s and %s", + e, SWAP(u)->parameters_proc_swaps.what, what_proc_swaps); + + if (!u) { + delete = true; + + r = unit_new_for_name(m, sizeof(Swap), e, &u); + if (r < 0) + goto fail; + + SWAP(u)->what = strdup(what); + if (!SWAP(u)->what) { + r = -ENOMEM; + goto fail; + } + + unit_add_to_load_queue(u); + } else + delete = false; + + p = &SWAP(u)->parameters_proc_swaps; + + if (!p->what) { + p->what = strdup(what_proc_swaps); + if (!p->what) { + r = -ENOMEM; + goto fail; + } + } + + /* The unit is definitely around now, mark it as loaded if it was previously referenced but could not be + * loaded. After all we can load it now, from the data in /proc/swaps. */ + if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) { + u->load_state = UNIT_LOADED; + u->load_error = 0; + } + + if (set_flags) { + SWAP(u)->is_active = true; + SWAP(u)->just_activated = !SWAP(u)->from_proc_swaps; + } + + SWAP(u)->from_proc_swaps = true; + + p->priority = priority; + + unit_add_to_dbus_queue(u); + return 0; + +fail: + log_unit_warning_errno(u, r, "Failed to load swap unit: %m"); + + if (delete) + unit_free(u); + + return r; +} + +static int swap_process_new(Manager *m, const char *device, int prio, bool set_flags) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + const char *dn, *devlink; + struct stat st, st_link; + int r; + + assert(m); + + r = swap_setup_unit(m, device, device, prio, set_flags); + if (r < 0) + return r; + + /* If this is a block device, then let's add duplicates for + * all other names of this block device */ + if (stat(device, &st) < 0 || !S_ISBLK(st.st_mode)) + return 0; + + r = device_new_from_stat_rdev(&d, &st); + if (r < 0) { + log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to allocate device for swap %s: %m", device); + return 0; + } + + /* Add the main device node */ + if (sd_device_get_devname(d, &dn) >= 0 && !streq(dn, device)) + swap_setup_unit(m, dn, device, prio, set_flags); + + /* Add additional units for all symlinks */ + FOREACH_DEVICE_DEVLINK(d, devlink) { + + /* Don't bother with the /dev/block links */ + if (streq(devlink, device)) + continue; + + if (path_startswith(devlink, "/dev/block/")) + continue; + + if (stat(devlink, &st_link) >= 0 && + (!S_ISBLK(st_link.st_mode) || + st_link.st_rdev != st.st_rdev)) + continue; + + swap_setup_unit(m, devlink, device, prio, set_flags); + } + + return 0; +} + +static void swap_set_state(Swap *s, SwapState state) { + SwapState old_state; + Swap *other; + + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + old_state = s->state; + s->state = state; + + if (!SWAP_STATE_WITH_PROCESS(state)) { + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + swap_unwatch_control_pid(s); + s->control_command = NULL; + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + } + + if (state != old_state) + log_unit_debug(UNIT(s), "Changed %s -> %s", swap_state_to_string(old_state), swap_state_to_string(state)); + + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0); + + /* If there other units for the same device node have a job + queued it might be worth checking again if it is runnable + now. This is necessary, since swap_start() refuses + operation with EAGAIN if there's already another job for + the same device node queued. */ + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (UNIT(other)->job) + job_add_to_run_queue(UNIT(other)->job); +} + +static int swap_coldplug(Unit *u) { + Swap *s = SWAP(u); + SwapState new_state = SWAP_DEAD; + int r; + + assert(s); + assert(s->state == SWAP_DEAD); + + if (s->deserialized_state != s->state) + new_state = s->deserialized_state; + else if (s->from_proc_swaps) + new_state = SWAP_ACTIVE; + + if (new_state == s->state) + return 0; + + if (s->control_pid > 0 && + pid_is_unwaited(s->control_pid) && + SWAP_STATE_WITH_PROCESS(new_state)) { + + r = unit_watch_pid(UNIT(s), s->control_pid); + if (r < 0) + return r; + + r = swap_arm_timer(s, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec)); + if (r < 0) + return r; + } + + if (!IN_SET(new_state, SWAP_DEAD, SWAP_FAILED)) { + (void) unit_setup_dynamic_creds(u); + (void) unit_setup_exec_runtime(u); + } + + swap_set_state(s, new_state); + return 0; +} + +static void swap_dump(Unit *u, FILE *f, const char *prefix) { + char buf[FORMAT_TIMESPAN_MAX]; + Swap *s = SWAP(u); + SwapParameters *p; + + assert(s); + assert(f); + + if (s->from_proc_swaps) + p = &s->parameters_proc_swaps; + else if (s->from_fragment) + p = &s->parameters_fragment; + else + p = NULL; + + fprintf(f, + "%sSwap State: %s\n" + "%sResult: %s\n" + "%sWhat: %s\n" + "%sFrom /proc/swaps: %s\n" + "%sFrom fragment: %s\n", + prefix, swap_state_to_string(s->state), + prefix, swap_result_to_string(s->result), + prefix, s->what, + prefix, yes_no(s->from_proc_swaps), + prefix, yes_no(s->from_fragment)); + + if (s->devnode) + fprintf(f, "%sDevice Node: %s\n", prefix, s->devnode); + + if (p) + fprintf(f, + "%sPriority: %i\n" + "%sOptions: %s\n", + prefix, p->priority, + prefix, strempty(p->options)); + + fprintf(f, + "%sTimeoutSec: %s\n", + prefix, format_timespan(buf, sizeof(buf), s->timeout_usec, USEC_PER_SEC)); + + if (s->control_pid > 0) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, s->control_pid); + + exec_context_dump(&s->exec_context, f, prefix); + kill_context_dump(&s->kill_context, f, prefix); + cgroup_context_dump(&s->cgroup_context, f, prefix); +} + +static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { + + _cleanup_(exec_params_clear) ExecParameters exec_params = { + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, + }; + pid_t pid; + int r; + + assert(s); + assert(c); + assert(_pid); + + r = unit_prepare_exec(UNIT(s)); + if (r < 0) + return r; + + r = swap_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + goto fail; + + r = unit_set_exec_params(UNIT(s), &exec_params); + if (r < 0) + goto fail; + + r = exec_spawn(UNIT(s), + c, + &s->exec_context, + &exec_params, + s->exec_runtime, + &s->dynamic_creds, + &pid); + if (r < 0) + goto fail; + + r = unit_watch_pid(UNIT(s), pid); + if (r < 0) + /* FIXME: we need to do something here */ + goto fail; + + *_pid = pid; + + return 0; + +fail: + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + + return r; +} + +static void swap_enter_dead(Swap *s, SwapResult f) { + assert(s); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + unit_log_result(UNIT(s), s->result == SWAP_SUCCESS, swap_result_to_string(s->result)); + swap_set_state(s, s->result != SWAP_SUCCESS ? SWAP_FAILED : SWAP_DEAD); + + s->exec_runtime = exec_runtime_unref(s->exec_runtime, true); + + exec_context_destroy_runtime_directory(&s->exec_context, UNIT(s)->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + + unit_unref_uid_gid(UNIT(s), true); + + dynamic_creds_destroy(&s->dynamic_creds); +} + +static void swap_enter_active(Swap *s, SwapResult f) { + assert(s); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + swap_set_state(s, SWAP_ACTIVE); +} + +static void swap_enter_dead_or_active(Swap *s, SwapResult f) { + assert(s); + + if (s->from_proc_swaps) + swap_enter_active(s, f); + else + swap_enter_dead(s, f); +} + +static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) { + int r; + KillOperation kop; + + assert(s); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + if (state == SWAP_DEACTIVATING_SIGTERM) + kop = KILL_TERMINATE; + else + kop = KILL_KILL; + + r = unit_kill_context(UNIT(s), &s->kill_context, kop, -1, s->control_pid, false); + if (r < 0) + goto fail; + + if (r > 0) { + r = swap_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + goto fail; + + swap_set_state(s, state); + } else if (state == SWAP_DEACTIVATING_SIGTERM && s->kill_context.send_sigkill) + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS); + else + swap_enter_dead_or_active(s, SWAP_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES); +} + +static void swap_enter_activating(Swap *s) { + _cleanup_free_ char *opts = NULL; + int r; + + assert(s); + + unit_warn_leftover_processes(UNIT(s)); + + s->control_command_id = SWAP_EXEC_ACTIVATE; + s->control_command = s->exec_command + SWAP_EXEC_ACTIVATE; + + if (s->from_fragment) { + int priority = -1; + + r = fstab_find_pri(s->parameters_fragment.options, &priority); + if (r < 0) + log_warning_errno(r, "Failed to parse swap priority \"%s\", ignoring: %m", s->parameters_fragment.options); + else if (r == 1 && s->parameters_fragment.priority >= 0) + log_warning("Duplicate swap priority configuration by Priority and Options fields."); + + if (r <= 0 && s->parameters_fragment.priority >= 0) { + if (s->parameters_fragment.options) + r = asprintf(&opts, "%s,pri=%i", s->parameters_fragment.options, s->parameters_fragment.priority); + else + r = asprintf(&opts, "pri=%i", s->parameters_fragment.priority); + if (r < 0) + goto fail; + } + } + + r = exec_command_set(s->control_command, "/sbin/swapon", NULL); + if (r < 0) + goto fail; + + if (s->parameters_fragment.options || opts) { + r = exec_command_append(s->control_command, "-o", + opts ? : s->parameters_fragment.options, NULL); + if (r < 0) + goto fail; + } + + r = exec_command_append(s->control_command, s->what, NULL); + if (r < 0) + goto fail; + + swap_unwatch_control_pid(s); + + r = swap_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + swap_set_state(s, SWAP_ACTIVATING); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'swapon' task: %m"); + swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES); +} + +static void swap_enter_deactivating(Swap *s) { + int r; + + assert(s); + + s->control_command_id = SWAP_EXEC_DEACTIVATE; + s->control_command = s->exec_command + SWAP_EXEC_DEACTIVATE; + + r = exec_command_set(s->control_command, + "/sbin/swapoff", + s->what, + NULL); + if (r < 0) + goto fail; + + swap_unwatch_control_pid(s); + + r = swap_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + swap_set_state(s, SWAP_DEACTIVATING); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'swapoff' task: %m"); + swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES); +} + +static void swap_cycle_clear(Swap *s) { + assert(s); + + s->result = SWAP_SUCCESS; + exec_command_reset_status_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX); + UNIT(s)->reset_accounting = true; +} + +static int swap_start(Unit *u) { + Swap *s = SWAP(u), *other; + int r; + + assert(s); + + /* We cannot fulfill this request right now, try again later please! */ + if (IN_SET(s->state, + SWAP_DEACTIVATING, + SWAP_DEACTIVATING_SIGTERM, + SWAP_DEACTIVATING_SIGKILL)) + return -EAGAIN; + + /* Already on it! */ + if (s->state == SWAP_ACTIVATING) + return 0; + + assert(IN_SET(s->state, SWAP_DEAD, SWAP_FAILED)); + + if (detect_container() > 0) + return -EPERM; + + /* If there's a job for another swap unit for the same node + * running, then let's not dispatch this one for now, and wait + * until that other job has finished. */ + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (UNIT(other)->job && UNIT(other)->job->state == JOB_RUNNING) + return -EAGAIN; + + r = unit_start_limit_test(u); + if (r < 0) { + swap_enter_dead(s, SWAP_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + swap_cycle_clear(s); + swap_enter_activating(s); + return 1; +} + +static int swap_stop(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + switch (s->state) { + + case SWAP_DEACTIVATING: + case SWAP_DEACTIVATING_SIGTERM: + case SWAP_DEACTIVATING_SIGKILL: + /* Already on it */ + return 0; + + case SWAP_ACTIVATING: + case SWAP_ACTIVATING_DONE: + /* There's a control process pending, directly enter kill mode */ + swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_SUCCESS); + return 0; + + case SWAP_ACTIVE: + if (detect_container() > 0) + return -EPERM; + + swap_enter_deactivating(s); + return 1; + + default: + assert_not_reached("Unexpected state."); + } +} + +static int swap_serialize(Unit *u, FILE *f, FDSet *fds) { + Swap *s = SWAP(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", swap_state_to_string(s->state)); + (void) serialize_item(f, "result", swap_result_to_string(s->result)); + + if (s->control_pid > 0) + (void) serialize_item_format(f, "control-pid", PID_FMT, s->control_pid); + + if (s->control_command_id >= 0) + (void) serialize_item(f, "control-command", swap_exec_command_to_string(s->control_command_id)); + + return 0; +} + +static int swap_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Swap *s = SWAP(u); + + assert(s); + assert(fds); + + if (streq(key, "state")) { + SwapState state; + + state = swap_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + } else if (streq(key, "result")) { + SwapResult f; + + f = swap_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != SWAP_SUCCESS) + s->result = f; + } else if (streq(key, "control-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse control-pid value: %s", value); + else + s->control_pid = pid; + + } else if (streq(key, "control-command")) { + SwapExecCommand id; + + id = swap_exec_command_from_string(value); + if (id < 0) + log_unit_debug(u, "Failed to parse exec-command value: %s", value); + else { + s->control_command_id = id; + s->control_command = s->exec_command + id; + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +_pure_ static UnitActiveState swap_active_state(Unit *u) { + assert(u); + + return state_translation_table[SWAP(u)->state]; +} + +_pure_ static const char *swap_sub_state_to_string(Unit *u) { + assert(u); + + return swap_state_to_string(SWAP(u)->state); +} + +_pure_ static bool swap_may_gc(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + if (s->from_proc_swaps) + return false; + + return true; +} + +static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Swap *s = SWAP(u); + SwapResult f; + + assert(s); + assert(pid >= 0); + + if (pid != s->control_pid) + return; + + s->control_pid = 0; + + if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + f = SWAP_SUCCESS; + else if (code == CLD_EXITED) + f = SWAP_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = SWAP_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = SWAP_FAILURE_CORE_DUMP; + else + assert_not_reached("Unknown code"); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + if (s->control_command) { + exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status); + + s->control_command = NULL; + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + } + + unit_log_process_exit( + u, f == SWAP_SUCCESS ? LOG_DEBUG : LOG_NOTICE, + "Swap process", + swap_exec_command_to_string(s->control_command_id), + code, status); + + switch (s->state) { + + case SWAP_ACTIVATING: + case SWAP_ACTIVATING_DONE: + + if (f == SWAP_SUCCESS || s->from_proc_swaps) + swap_enter_active(s, f); + else + swap_enter_dead(s, f); + break; + + case SWAP_DEACTIVATING: + case SWAP_DEACTIVATING_SIGKILL: + case SWAP_DEACTIVATING_SIGTERM: + + swap_enter_dead_or_active(s, f); + break; + + default: + assert_not_reached("Uh, control process died at wrong time."); + } + + /* Notify clients about changed exit status */ + unit_add_to_dbus_queue(u); +} + +static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Swap *s = SWAP(userdata); + + assert(s); + assert(s->timer_event_source == source); + + switch (s->state) { + + case SWAP_ACTIVATING: + case SWAP_ACTIVATING_DONE: + log_unit_warning(UNIT(s), "Activation timed out. Stopping."); + swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT); + break; + + case SWAP_DEACTIVATING: + log_unit_warning(UNIT(s), "Deactivation timed out. Stopping."); + swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT); + break; + + case SWAP_DEACTIVATING_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Swap process timed out. Killing."); + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Swap process timed out. Skipping SIGKILL. Ignoring."); + swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT); + } + break; + + case SWAP_DEACTIVATING_SIGKILL: + log_unit_warning(UNIT(s), "Swap process still around after SIGKILL. Ignoring."); + swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +static int swap_load_proc_swaps(Manager *m, bool set_flags) { + unsigned i; + + assert(m); + + rewind(m->proc_swaps); + + (void) fscanf(m->proc_swaps, "%*s %*s %*s %*s %*s\n"); + + for (i = 1;; i++) { + _cleanup_free_ char *dev = NULL, *d = NULL; + int prio = 0, k; + + k = fscanf(m->proc_swaps, + "%ms " /* device/file */ + "%*s " /* type of swap */ + "%*s " /* swap size */ + "%*s " /* used */ + "%i\n", /* priority */ + &dev, &prio); + if (k != 2) { + if (k == EOF) + break; + + log_warning("Failed to parse /proc/swaps:%u.", i); + continue; + } + + if (cunescape(dev, UNESCAPE_RELAX, &d) < 0) + return log_oom(); + + device_found_node(m, d, DEVICE_FOUND_SWAP, DEVICE_FOUND_SWAP); + + (void) swap_process_new(m, d, prio, set_flags); + } + + return 0; +} + +static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + Unit *u; + int r; + + assert(m); + assert(revents & EPOLLPRI); + + r = swap_load_proc_swaps(m, true); + if (r < 0) { + log_error_errno(r, "Failed to reread /proc/swaps: %m"); + + /* Reset flags, just in case, for late calls */ + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) { + Swap *swap = SWAP(u); + + swap->is_active = swap->just_activated = false; + } + + return 0; + } + + manager_dispatch_load_queue(m); + + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) { + Swap *swap = SWAP(u); + + if (!swap->is_active) { + + swap_unset_proc_swaps(swap); + + switch (swap->state) { + + case SWAP_ACTIVE: + /* This has just been deactivated */ + swap_enter_dead(swap, SWAP_SUCCESS); + break; + + default: + /* Fire again */ + swap_set_state(swap, swap->state); + break; + } + + if (swap->what) + device_found_node(m, swap->what, 0, DEVICE_FOUND_SWAP); + + } else if (swap->just_activated) { + + /* New swap entry */ + + switch (swap->state) { + + case SWAP_DEAD: + case SWAP_FAILED: + (void) unit_acquire_invocation_id(u); + swap_cycle_clear(swap); + swap_enter_active(swap, SWAP_SUCCESS); + break; + + case SWAP_ACTIVATING: + swap_set_state(swap, SWAP_ACTIVATING_DONE); + break; + + default: + /* Nothing really changed, but let's + * issue an notification call + * nonetheless, in case somebody is + * waiting for this. */ + swap_set_state(swap, swap->state); + break; + } + } + + /* Reset the flags for later calls */ + swap->is_active = swap->just_activated = false; + } + + return 1; +} + +static Unit *swap_following(Unit *u) { + Swap *s = SWAP(u); + Swap *other, *first = NULL; + + assert(s); + + /* If the user configured the swap through /etc/fstab or + * a device unit, follow that. */ + + if (s->from_fragment) + return NULL; + + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (other->from_fragment) + return UNIT(other); + + /* Otherwise, make everybody follow the unit that's named after + * the swap device in the kernel */ + + if (streq_ptr(s->what, s->devnode)) + return NULL; + + LIST_FOREACH_AFTER(same_devnode, other, s) + if (streq_ptr(other->what, other->devnode)) + return UNIT(other); + + LIST_FOREACH_BEFORE(same_devnode, other, s) { + if (streq_ptr(other->what, other->devnode)) + return UNIT(other); + + first = other; + } + + /* Fall back to the first on the list */ + return UNIT(first); +} + +static int swap_following_set(Unit *u, Set **_set) { + Swap *s = SWAP(u), *other; + _cleanup_set_free_ Set *set = NULL; + int r; + + assert(s); + assert(_set); + + if (LIST_JUST_US(same_devnode, s)) { + *_set = NULL; + return 0; + } + + set = set_new(NULL); + if (!set) + return -ENOMEM; + + LIST_FOREACH_OTHERS(same_devnode, other, s) { + r = set_put(set, other); + if (r < 0) + return r; + } + + *_set = TAKE_PTR(set); + return 1; +} + +static void swap_shutdown(Manager *m) { + assert(m); + + m->swap_event_source = sd_event_source_unref(m->swap_event_source); + m->proc_swaps = safe_fclose(m->proc_swaps); + m->swaps_by_devnode = hashmap_free(m->swaps_by_devnode); +} + +static void swap_enumerate(Manager *m) { + int r; + + assert(m); + + if (!m->proc_swaps) { + m->proc_swaps = fopen("/proc/swaps", "re"); + if (!m->proc_swaps) { + if (errno == ENOENT) + log_debug_errno(errno, "Not swap enabled, skipping enumeration."); + else + log_warning_errno(errno, "Failed to open /proc/swaps, ignoring: %m"); + + return; + } + + r = sd_event_add_io(m->event, &m->swap_event_source, fileno(m->proc_swaps), EPOLLPRI, swap_dispatch_io, m); + if (r < 0) { + log_error_errno(r, "Failed to watch /proc/swaps: %m"); + goto fail; + } + + /* Dispatch this before we dispatch SIGCHLD, so that + * we always get the events from /proc/swaps before + * the SIGCHLD of /sbin/swapon. */ + r = sd_event_source_set_priority(m->swap_event_source, SD_EVENT_PRIORITY_NORMAL-10); + if (r < 0) { + log_error_errno(r, "Failed to change /proc/swaps priority: %m"); + goto fail; + } + + (void) sd_event_source_set_description(m->swap_event_source, "swap-proc"); + } + + r = swap_load_proc_swaps(m, false); + if (r < 0) + goto fail; + + return; + +fail: + swap_shutdown(m); +} + +int swap_process_device_new(Manager *m, sd_device *dev) { + _cleanup_free_ char *e = NULL; + const char *dn, *devlink; + Unit *u; + int r = 0; + + assert(m); + assert(dev); + + r = sd_device_get_devname(dev, &dn); + if (r < 0) + return 0; + + r = unit_name_from_path(dn, ".swap", &e); + if (r < 0) + return r; + + u = manager_get_unit(m, e); + if (u) + r = swap_set_devnode(SWAP(u), dn); + + FOREACH_DEVICE_DEVLINK(dev, devlink) { + _cleanup_free_ char *n = NULL; + int q; + + q = unit_name_from_path(devlink, ".swap", &n); + if (q < 0) + return q; + + u = manager_get_unit(m, n); + if (u) { + q = swap_set_devnode(SWAP(u), dn); + if (q < 0) + r = q; + } + } + + return r; +} + +int swap_process_device_remove(Manager *m, sd_device *dev) { + const char *dn; + int r = 0; + Swap *s; + + r = sd_device_get_devname(dev, &dn); + if (r < 0) + return 0; + + while ((s = hashmap_get(m->swaps_by_devnode, dn))) { + int q; + + q = swap_set_devnode(s, NULL); + if (q < 0) + r = q; + } + + return r; +} + +static void swap_reset_failed(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + if (s->state == SWAP_FAILED) + swap_set_state(s, SWAP_DEAD); + + s->result = SWAP_SUCCESS; +} + +static int swap_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + return unit_kill_common(u, who, signo, -1, SWAP(u)->control_pid, error); +} + +static int swap_get_timeout(Unit *u, usec_t *timeout) { + Swap *s = SWAP(u); + usec_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static bool swap_supported(void) { + static int supported = -1; + + /* If swap support is not available in the kernel, or we are + * running in a container we don't support swap units, and any + * attempts to starting one should fail immediately. */ + + if (supported < 0) + supported = + access("/proc/swaps", F_OK) >= 0 && + detect_container() <= 0; + + return supported; +} + +static int swap_control_pid(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + return s->control_pid; +} + +static const char* const swap_exec_command_table[_SWAP_EXEC_COMMAND_MAX] = { + [SWAP_EXEC_ACTIVATE] = "ExecActivate", + [SWAP_EXEC_DEACTIVATE] = "ExecDeactivate", +}; + +DEFINE_STRING_TABLE_LOOKUP(swap_exec_command, SwapExecCommand); + +static const char* const swap_result_table[_SWAP_RESULT_MAX] = { + [SWAP_SUCCESS] = "success", + [SWAP_FAILURE_RESOURCES] = "resources", + [SWAP_FAILURE_TIMEOUT] = "timeout", + [SWAP_FAILURE_EXIT_CODE] = "exit-code", + [SWAP_FAILURE_SIGNAL] = "signal", + [SWAP_FAILURE_CORE_DUMP] = "core-dump", + [SWAP_FAILURE_START_LIMIT_HIT] = "start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(swap_result, SwapResult); + +const UnitVTable swap_vtable = { + .object_size = sizeof(Swap), + .exec_context_offset = offsetof(Swap, exec_context), + .cgroup_context_offset = offsetof(Swap, cgroup_context), + .kill_context_offset = offsetof(Swap, kill_context), + .exec_runtime_offset = offsetof(Swap, exec_runtime), + .dynamic_creds_offset = offsetof(Swap, dynamic_creds), + + .sections = + "Unit\0" + "Swap\0" + "Install\0", + .private_section = "Swap", + + .init = swap_init, + .load = swap_load, + .done = swap_done, + + .coldplug = swap_coldplug, + + .dump = swap_dump, + + .start = swap_start, + .stop = swap_stop, + + .kill = swap_kill, + + .get_timeout = swap_get_timeout, + + .serialize = swap_serialize, + .deserialize_item = swap_deserialize_item, + + .active_state = swap_active_state, + .sub_state_to_string = swap_sub_state_to_string, + + .may_gc = swap_may_gc, + + .sigchld_event = swap_sigchld_event, + + .reset_failed = swap_reset_failed, + + .control_pid = swap_control_pid, + + .bus_vtable = bus_swap_vtable, + .bus_set_property = bus_swap_set_property, + .bus_commit_properties = bus_swap_commit_properties, + + .following = swap_following, + .following_set = swap_following_set, + + .enumerate = swap_enumerate, + .shutdown = swap_shutdown, + .supported = swap_supported, + + .status_message_formats = { + .starting_stopping = { + [0] = "Activating swap %s...", + [1] = "Deactivating swap %s...", + }, + .finished_start_job = { + [JOB_DONE] = "Activated swap %s.", + [JOB_FAILED] = "Failed to activate swap %s.", + [JOB_TIMEOUT] = "Timed out activating swap %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Deactivated swap %s.", + [JOB_FAILED] = "Failed deactivating swap %s.", + [JOB_TIMEOUT] = "Timed out deactivating swap %s.", + }, + }, +}; diff --git a/src/core/swap.h b/src/core/swap.h new file mode 100644 index 0000000..1a4b60b --- /dev/null +++ b/src/core/swap.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +/*** + Copyright © 2010 Maarten Lankhorst +***/ + +#include "sd-device.h" +#include "unit.h" + +typedef struct Swap Swap; + +typedef enum SwapExecCommand { + SWAP_EXEC_ACTIVATE, + SWAP_EXEC_DEACTIVATE, + _SWAP_EXEC_COMMAND_MAX, + _SWAP_EXEC_COMMAND_INVALID = -1 +} SwapExecCommand; + +typedef enum SwapResult { + SWAP_SUCCESS, + SWAP_FAILURE_RESOURCES, + SWAP_FAILURE_TIMEOUT, + SWAP_FAILURE_EXIT_CODE, + SWAP_FAILURE_SIGNAL, + SWAP_FAILURE_CORE_DUMP, + SWAP_FAILURE_START_LIMIT_HIT, + _SWAP_RESULT_MAX, + _SWAP_RESULT_INVALID = -1 +} SwapResult; + +typedef struct SwapParameters { + char *what; + char *options; + int priority; +} SwapParameters; + +struct Swap { + Unit meta; + + char *what; + + /* If the device has already shown up, this is the device + * node, which might be different from what, due to + * symlinks */ + char *devnode; + + SwapParameters parameters_proc_swaps; + SwapParameters parameters_fragment; + + bool from_proc_swaps:1; + bool from_fragment:1; + + /* Used while looking for swaps that vanished or got added + * from/to /proc/swaps */ + bool is_active:1; + bool just_activated:1; + + SwapResult result; + + usec_t timeout_usec; + + ExecCommand exec_command[_SWAP_EXEC_COMMAND_MAX]; + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ExecRuntime *exec_runtime; + DynamicCreds dynamic_creds; + + SwapState state, deserialized_state; + + ExecCommand* control_command; + SwapExecCommand control_command_id; + pid_t control_pid; + + sd_event_source *timer_event_source; + + /* In order to be able to distinguish dependencies on + different device nodes we might end up creating multiple + devices for the same swap. We chain them up here. */ + + LIST_FIELDS(struct Swap, same_devnode); +}; + +extern const UnitVTable swap_vtable; + +int swap_process_device_new(Manager *m, sd_device *dev); +int swap_process_device_remove(Manager *m, sd_device *dev); + +const char* swap_exec_command_to_string(SwapExecCommand i) _const_; +SwapExecCommand swap_exec_command_from_string(const char *s) _pure_; + +const char* swap_result_to_string(SwapResult i) _const_; +SwapResult swap_result_from_string(const char *s) _pure_; + +DEFINE_CAST(SWAP, Swap); diff --git a/src/core/system.conf.in b/src/core/system.conf.in new file mode 100644 index 0000000..0a58737 --- /dev/null +++ b/src/core/system.conf.in @@ -0,0 +1,64 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# Entries in this file show the compile time defaults. +# You can change settings by editing this file. +# Defaults can be restored by simply deleting this file. +# +# See systemd-system.conf(5) for details. + +[Manager] +#LogLevel=info +#LogTarget=journal-or-kmsg +#LogColor=yes +#LogLocation=no +#DumpCore=yes +#ShowStatus=yes +#CrashChangeVT=no +#CrashShell=no +#CrashReboot=no +#CtrlAltDelBurstAction=reboot-force +#CPUAffinity=1 2 +#RuntimeWatchdogSec=0 +#ShutdownWatchdogSec=10min +#WatchdogDevice= +#CapabilityBoundingSet= +#NoNewPrivileges=no +#SystemCallArchitectures= +#TimerSlackNSec= +#DefaultTimerAccuracySec=1min +#DefaultStandardOutput=journal +#DefaultStandardError=inherit +#DefaultTimeoutStartSec=90s +#DefaultTimeoutStopSec=90s +#DefaultRestartSec=100ms +#DefaultStartLimitIntervalSec=10s +#DefaultStartLimitBurst=5 +#DefaultEnvironment= +#DefaultCPUAccounting=no +#DefaultIOAccounting=no +#DefaultIPAccounting=no +#DefaultBlockIOAccounting=no +#DefaultMemoryAccounting=@MEMORY_ACCOUNTING_DEFAULT@ +#DefaultTasksAccounting=yes +#DefaultTasksMax=15% +#DefaultLimitCPU= +#DefaultLimitFSIZE= +#DefaultLimitDATA= +#DefaultLimitSTACK= +#DefaultLimitCORE= +#DefaultLimitRSS= +#DefaultLimitNOFILE=1024:@HIGH_RLIMIT_NOFILE@ +#DefaultLimitAS= +#DefaultLimitNPROC= +#DefaultLimitMEMLOCK= +#DefaultLimitLOCKS= +#DefaultLimitSIGPENDING= +#DefaultLimitMSGQUEUE= +#DefaultLimitNICE= +#DefaultLimitRTPRIO= +#DefaultLimitRTTIME= diff --git a/src/core/systemd.pc.in b/src/core/systemd.pc.in new file mode 100644 index 0000000..0dae950 --- /dev/null +++ b/src/core/systemd.pc.in @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: LGPL-2.1+ +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +prefix=@prefix@ +systemdutildir=@rootlibexecdir@ +systemdsystemunitdir=@systemunitdir@ +systemdsystempresetdir=@systempresetdir@ +systemduserunitdir=@userunitdir@ +systemduserpresetdir=@userpresetdir@ +systemdsystemconfdir=@pkgsysconfdir@/system +systemduserconfdir=@pkgsysconfdir@/user +systemdsystemunitpath=${systemdsystemconfdir}:/etc/systemd/system:/run/systemd/system:/usr/local/lib/systemd/system:${systemdsystemunitdir}:/usr/lib/systemd/system:/lib/systemd/system +systemduserunitpath=${systemduserconfdir}:/etc/systemd/user:/run/systemd/user:/usr/local/lib/systemd/user:/usr/local/share/systemd/user:${systemduserunitdir}:/usr/lib/systemd/user:/usr/share/systemd/user +systemdsystemgeneratordir=@systemgeneratordir@ +systemdusergeneratordir=@usergeneratordir@ +systemdsleepdir=@systemsleepdir@ +systemdshutdowndir=@systemshutdowndir@ +tmpfilesdir=@tmpfilesdir@ +sysusersdir=@sysusersdir@ +sysctldir=@sysctldir@ +binfmtdir=@binfmtdir@ +modulesloaddir=@modulesloaddir@ +catalogdir=@catalogdir@ +systemuidmax=@systemuidmax@ +systemgidmax=@systemgidmax@ +dynamicuidmin=@dynamicuidmin@ +dynamicuidmax=@dynamicuidmax@ +containeruidbasemin=@containeruidbasemin@ +containeruidbasemax=@containeruidbasemax@ + +Name: systemd +Description: systemd System and Service Manager +URL: @PROJECT_URL@ +Version: @PROJECT_VERSION@ diff --git a/src/core/target.c b/src/core/target.c new file mode 100644 index 0000000..421a304 --- /dev/null +++ b/src/core/target.c @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "dbus-target.h" +#include "dbus-unit.h" +#include "log.h" +#include "serialize.h" +#include "special.h" +#include "string-util.h" +#include "target.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_TARGET_STATE_MAX] = { + [TARGET_DEAD] = UNIT_INACTIVE, + [TARGET_ACTIVE] = UNIT_ACTIVE +}; + +static void target_set_state(Target *t, TargetState state) { + TargetState old_state; + assert(t); + + if (t->state != state) + bus_unit_send_pending_change_signal(UNIT(t), false); + + old_state = t->state; + t->state = state; + + if (state != old_state) + log_debug("%s changed %s -> %s", + UNIT(t)->id, + target_state_to_string(old_state), + target_state_to_string(state)); + + unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int target_add_default_dependencies(Target *t) { + + static const UnitDependency deps[] = { + UNIT_REQUIRES, + UNIT_REQUISITE, + UNIT_WANTS, + UNIT_BINDS_TO, + UNIT_PART_OF + }; + + int r; + unsigned k; + + assert(t); + + if (!UNIT(t)->default_dependencies) + return 0; + + /* Imply ordering for requirement dependencies on target units. Note that when the user created a contradicting + * ordering manually we won't add anything in here to make sure we don't create a loop. */ + + for (k = 0; k < ELEMENTSOF(deps); k++) { + Unit *other; + Iterator i; + void *v; + + HASHMAP_FOREACH_KEY(v, other, UNIT(t)->dependencies[deps[k]], i) { + r = unit_add_default_target_dependency(other, UNIT(t)); + if (r < 0) + return r; + } + } + + if (unit_has_name(UNIT(t), SPECIAL_SHUTDOWN_TARGET)) + return 0; + + /* Make sure targets are unloaded on shutdown */ + return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int target_load(Unit *u) { + Target *t = TARGET(u); + int r; + + assert(t); + + r = unit_load_fragment_and_dropin(u); + if (r < 0) + return r; + + /* This is a new unit? Then let's add in some extras */ + if (u->load_state == UNIT_LOADED) { + r = target_add_default_dependencies(t); + if (r < 0) + return r; + } + + return 0; +} + +static int target_coldplug(Unit *u) { + Target *t = TARGET(u); + + assert(t); + assert(t->state == TARGET_DEAD); + + if (t->deserialized_state != t->state) + target_set_state(t, t->deserialized_state); + + return 0; +} + +static void target_dump(Unit *u, FILE *f, const char *prefix) { + Target *t = TARGET(u); + + assert(t); + assert(f); + + fprintf(f, + "%sTarget State: %s\n", + prefix, target_state_to_string(t->state)); +} + +static int target_start(Unit *u) { + Target *t = TARGET(u); + int r; + + assert(t); + assert(t->state == TARGET_DEAD); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + target_set_state(t, TARGET_ACTIVE); + return 1; +} + +static int target_stop(Unit *u) { + Target *t = TARGET(u); + + assert(t); + assert(t->state == TARGET_ACTIVE); + + target_set_state(t, TARGET_DEAD); + return 1; +} + +static int target_serialize(Unit *u, FILE *f, FDSet *fds) { + Target *s = TARGET(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", target_state_to_string(s->state)); + return 0; +} + +static int target_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Target *s = TARGET(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + TargetState state; + + state = target_state_from_string(value); + if (state < 0) + log_debug("Failed to parse state value %s", value); + else + s->deserialized_state = state; + + } else + log_debug("Unknown serialization key '%s'", key); + + return 0; +} + +_pure_ static UnitActiveState target_active_state(Unit *u) { + assert(u); + + return state_translation_table[TARGET(u)->state]; +} + +_pure_ static const char *target_sub_state_to_string(Unit *u) { + assert(u); + + return target_state_to_string(TARGET(u)->state); +} + +const UnitVTable target_vtable = { + .object_size = sizeof(Target), + + .sections = + "Unit\0" + "Target\0" + "Install\0", + + .load = target_load, + .coldplug = target_coldplug, + + .dump = target_dump, + + .start = target_start, + .stop = target_stop, + + .serialize = target_serialize, + .deserialize_item = target_deserialize_item, + + .active_state = target_active_state, + .sub_state_to_string = target_sub_state_to_string, + + .bus_vtable = bus_target_vtable, + + .status_message_formats = { + .finished_start_job = { + [JOB_DONE] = "Reached target %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Stopped target %s.", + }, + }, +}; diff --git a/src/core/target.h b/src/core/target.h new file mode 100644 index 0000000..28f7888 --- /dev/null +++ b/src/core/target.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "unit.h" + +typedef struct Target Target; + +struct Target { + Unit meta; + + TargetState state, deserialized_state; +}; + +extern const UnitVTable target_vtable; + +DEFINE_CAST(TARGET, Target); diff --git a/src/core/timer.c b/src/core/timer.c new file mode 100644 index 0000000..d9ba2f7 --- /dev/null +++ b/src/core/timer.c @@ -0,0 +1,881 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "dbus-timer.h" +#include "dbus-unit.h" +#include "fs-util.h" +#include "parse-util.h" +#include "random-util.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "timer.h" +#include "unit-name.h" +#include "unit.h" +#include "user-util.h" +#include "virt.h" + +static const UnitActiveState state_translation_table[_TIMER_STATE_MAX] = { + [TIMER_DEAD] = UNIT_INACTIVE, + [TIMER_WAITING] = UNIT_ACTIVE, + [TIMER_RUNNING] = UNIT_ACTIVE, + [TIMER_ELAPSED] = UNIT_ACTIVE, + [TIMER_FAILED] = UNIT_FAILED +}; + +static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata); + +static void timer_init(Unit *u) { + Timer *t = TIMER(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + t->next_elapse_monotonic_or_boottime = USEC_INFINITY; + t->next_elapse_realtime = USEC_INFINITY; + t->accuracy_usec = u->manager->default_timer_accuracy_usec; + t->remain_after_elapse = true; +} + +void timer_free_values(Timer *t) { + TimerValue *v; + + assert(t); + + while ((v = t->values)) { + LIST_REMOVE(value, t->values, v); + calendar_spec_free(v->calendar_spec); + free(v); + } +} + +static void timer_done(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + + timer_free_values(t); + + t->monotonic_event_source = sd_event_source_unref(t->monotonic_event_source); + t->realtime_event_source = sd_event_source_unref(t->realtime_event_source); + + free(t->stamp_path); +} + +static int timer_verify(Timer *t) { + assert(t); + + if (UNIT(t)->load_state != UNIT_LOADED) + return 0; + + if (!t->values) { + log_unit_error(UNIT(t), "Timer unit lacks value setting. Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static int timer_add_default_dependencies(Timer *t) { + int r; + TimerValue *v; + + assert(t); + + if (!UNIT(t)->default_dependencies) + return 0; + + r = unit_add_dependency_by_name(UNIT(t), UNIT_BEFORE, SPECIAL_TIMERS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) { + r = unit_add_two_dependencies_by_name(UNIT(t), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + LIST_FOREACH(value, v, t->values) { + if (v->base == TIMER_CALENDAR) { + r = unit_add_dependency_by_name(UNIT(t), UNIT_AFTER, SPECIAL_TIME_SYNC_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + break; + } + } + } + + return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int timer_add_trigger_dependencies(Timer *t) { + Unit *x; + int r; + + assert(t); + + if (!hashmap_isempty(UNIT(t)->dependencies[UNIT_TRIGGERS])) + return 0; + + r = unit_load_related_unit(UNIT(t), ".service", &x); + if (r < 0) + return r; + + return unit_add_two_dependencies(UNIT(t), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int timer_setup_persistent(Timer *t) { + int r; + + assert(t); + + if (!t->persistent) + return 0; + + if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) { + + r = unit_require_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + t->stamp_path = strappend("/var/lib/systemd/timers/stamp-", UNIT(t)->id); + } else { + const char *e; + + e = getenv("XDG_DATA_HOME"); + if (e) + t->stamp_path = strjoin(e, "/systemd/timers/stamp-", UNIT(t)->id); + else { + + _cleanup_free_ char *h = NULL; + + r = get_home_dir(&h); + if (r < 0) + return log_unit_error_errno(UNIT(t), r, "Failed to determine home directory: %m"); + + t->stamp_path = strjoin(h, "/.local/share/systemd/timers/stamp-", UNIT(t)->id); + } + } + + if (!t->stamp_path) + return log_oom(); + + return 0; +} + +static int timer_load(Unit *u) { + Timer *t = TIMER(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + r = unit_load_fragment_and_dropin(u); + if (r < 0) + return r; + + if (u->load_state == UNIT_LOADED) { + + r = timer_add_trigger_dependencies(t); + if (r < 0) + return r; + + r = timer_setup_persistent(t); + if (r < 0) + return r; + + r = timer_add_default_dependencies(t); + if (r < 0) + return r; + } + + return timer_verify(t); +} + +static void timer_dump(Unit *u, FILE *f, const char *prefix) { + char buf[FORMAT_TIMESPAN_MAX]; + Timer *t = TIMER(u); + Unit *trigger; + TimerValue *v; + + trigger = UNIT_TRIGGER(u); + + fprintf(f, + "%sTimer State: %s\n" + "%sResult: %s\n" + "%sUnit: %s\n" + "%sPersistent: %s\n" + "%sWakeSystem: %s\n" + "%sAccuracy: %s\n" + "%sRemainAfterElapse: %s\n", + prefix, timer_state_to_string(t->state), + prefix, timer_result_to_string(t->result), + prefix, trigger ? trigger->id : "n/a", + prefix, yes_no(t->persistent), + prefix, yes_no(t->wake_system), + prefix, format_timespan(buf, sizeof(buf), t->accuracy_usec, 1), + prefix, yes_no(t->remain_after_elapse)); + + LIST_FOREACH(value, v, t->values) { + + if (v->base == TIMER_CALENDAR) { + _cleanup_free_ char *p = NULL; + + (void) calendar_spec_to_string(v->calendar_spec, &p); + + fprintf(f, + "%s%s: %s\n", + prefix, + timer_base_to_string(v->base), + strna(p)); + } else { + char timespan1[FORMAT_TIMESPAN_MAX]; + + fprintf(f, + "%s%s: %s\n", + prefix, + timer_base_to_string(v->base), + format_timespan(timespan1, sizeof(timespan1), v->value, 0)); + } + } +} + +static void timer_set_state(Timer *t, TimerState state) { + TimerState old_state; + assert(t); + + if (t->state != state) + bus_unit_send_pending_change_signal(UNIT(t), false); + + old_state = t->state; + t->state = state; + + if (state != TIMER_WAITING) { + t->monotonic_event_source = sd_event_source_unref(t->monotonic_event_source); + t->realtime_event_source = sd_event_source_unref(t->realtime_event_source); + t->next_elapse_monotonic_or_boottime = USEC_INFINITY; + t->next_elapse_realtime = USEC_INFINITY; + } + + if (state != old_state) + log_unit_debug(UNIT(t), "Changed %s -> %s", timer_state_to_string(old_state), timer_state_to_string(state)); + + unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], 0); +} + +static void timer_enter_waiting(Timer *t, bool time_change); + +static int timer_coldplug(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + assert(t->state == TIMER_DEAD); + + if (t->deserialized_state == t->state) + return 0; + + if (t->deserialized_state == TIMER_WAITING) + timer_enter_waiting(t, false); + else + timer_set_state(t, t->deserialized_state); + + return 0; +} + +static void timer_enter_dead(Timer *t, TimerResult f) { + assert(t); + + if (t->result == TIMER_SUCCESS) + t->result = f; + + unit_log_result(UNIT(t), t->result == TIMER_SUCCESS, timer_result_to_string(t->result)); + timer_set_state(t, t->result != TIMER_SUCCESS ? TIMER_FAILED : TIMER_DEAD); +} + +static void timer_enter_elapsed(Timer *t, bool leave_around) { + assert(t); + + /* If a unit is marked with RemainAfterElapse=yes we leave it + * around even after it elapsed once, so that starting it + * later again does not necessarily mean immediate + * retriggering. We unconditionally leave units with + * TIMER_UNIT_ACTIVE or TIMER_UNIT_INACTIVE triggers around, + * since they might be restarted automatically at any time + * later on. */ + + if (t->remain_after_elapse || leave_around) + timer_set_state(t, TIMER_ELAPSED); + else + timer_enter_dead(t, TIMER_SUCCESS); +} + +static void add_random(Timer *t, usec_t *v) { + char s[FORMAT_TIMESPAN_MAX]; + usec_t add; + + assert(t); + assert(v); + + if (t->random_usec == 0) + return; + if (*v == USEC_INFINITY) + return; + + add = random_u64() % t->random_usec; + + if (*v + add < *v) /* overflow */ + *v = (usec_t) -2; /* Highest possible value, that is not USEC_INFINITY */ + else + *v += add; + + log_unit_debug(UNIT(t), "Adding %s random time.", format_timespan(s, sizeof(s), add, 0)); +} + +static void timer_enter_waiting(Timer *t, bool time_change) { + bool found_monotonic = false, found_realtime = false; + bool leave_around = false; + triple_timestamp ts; + TimerValue *v; + Unit *trigger; + int r; + + assert(t); + + trigger = UNIT_TRIGGER(UNIT(t)); + if (!trigger) { + log_unit_error(UNIT(t), "Unit to trigger vanished."); + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); + return; + } + + triple_timestamp_get(&ts); + t->next_elapse_monotonic_or_boottime = t->next_elapse_realtime = 0; + + LIST_FOREACH(value, v, t->values) { + if (v->disabled) + continue; + + if (v->base == TIMER_CALENDAR) { + usec_t b; + + /* If we know the last time this was + * triggered, schedule the job based relative + * to that. If we don't, just start from + * the activation time. */ + + if (t->last_trigger.realtime > 0) + b = t->last_trigger.realtime; + else { + if (state_translation_table[t->state] == UNIT_ACTIVE) + b = UNIT(t)->inactive_exit_timestamp.realtime; + else + b = ts.realtime; + } + + r = calendar_spec_next_usec(v->calendar_spec, b, &v->next_elapse); + if (r < 0) + continue; + + if (!found_realtime) + t->next_elapse_realtime = v->next_elapse; + else + t->next_elapse_realtime = MIN(t->next_elapse_realtime, v->next_elapse); + + found_realtime = true; + + } else { + usec_t base; + + switch (v->base) { + + case TIMER_ACTIVE: + if (state_translation_table[t->state] == UNIT_ACTIVE) + base = UNIT(t)->inactive_exit_timestamp.monotonic; + else + base = ts.monotonic; + break; + + case TIMER_BOOT: + if (detect_container() <= 0) { + /* CLOCK_MONOTONIC equals the uptime on Linux */ + base = 0; + break; + } + /* In a container we don't want to include the time the host + * was already up when the container started, so count from + * our own startup. */ + _fallthrough_; + case TIMER_STARTUP: + base = UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic; + break; + + case TIMER_UNIT_ACTIVE: + leave_around = true; + base = trigger->inactive_exit_timestamp.monotonic; + + if (base <= 0) + base = t->last_trigger.monotonic; + + if (base <= 0) + continue; + base = MAX(base, t->last_trigger.monotonic); + + break; + + case TIMER_UNIT_INACTIVE: + leave_around = true; + base = trigger->inactive_enter_timestamp.monotonic; + + if (base <= 0) + base = t->last_trigger.monotonic; + + if (base <= 0) + continue; + base = MAX(base, t->last_trigger.monotonic); + + break; + + default: + assert_not_reached("Unknown timer base"); + } + + v->next_elapse = usec_add(usec_shift_clock(base, CLOCK_MONOTONIC, TIMER_MONOTONIC_CLOCK(t)), v->value); + + if (dual_timestamp_is_set(&t->last_trigger) && + !time_change && + v->next_elapse < triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t)) && + IN_SET(v->base, TIMER_ACTIVE, TIMER_BOOT, TIMER_STARTUP)) { + /* This is a one time trigger, disable it now */ + v->disabled = true; + continue; + } + + if (!found_monotonic) + t->next_elapse_monotonic_or_boottime = v->next_elapse; + else + t->next_elapse_monotonic_or_boottime = MIN(t->next_elapse_monotonic_or_boottime, v->next_elapse); + + found_monotonic = true; + } + } + + if (!found_monotonic && !found_realtime) { + log_unit_debug(UNIT(t), "Timer is elapsed."); + timer_enter_elapsed(t, leave_around); + return; + } + + if (found_monotonic) { + char buf[FORMAT_TIMESPAN_MAX]; + usec_t left; + + add_random(t, &t->next_elapse_monotonic_or_boottime); + + left = usec_sub_unsigned(t->next_elapse_monotonic_or_boottime, triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t))); + log_unit_debug(UNIT(t), "Monotonic timer elapses in %s.", format_timespan(buf, sizeof(buf), left, 0)); + + if (t->monotonic_event_source) { + r = sd_event_source_set_time(t->monotonic_event_source, t->next_elapse_monotonic_or_boottime); + if (r < 0) + goto fail; + + r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_ONESHOT); + if (r < 0) + goto fail; + } else { + + r = sd_event_add_time( + UNIT(t)->manager->event, + &t->monotonic_event_source, + t->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC, + t->next_elapse_monotonic_or_boottime, t->accuracy_usec, + timer_dispatch, t); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(t->monotonic_event_source, "timer-monotonic"); + } + + } else if (t->monotonic_event_source) { + + r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_OFF); + if (r < 0) + goto fail; + } + + if (found_realtime) { + char buf[FORMAT_TIMESTAMP_MAX]; + + add_random(t, &t->next_elapse_realtime); + + log_unit_debug(UNIT(t), "Realtime timer elapses at %s.", format_timestamp(buf, sizeof(buf), t->next_elapse_realtime)); + + if (t->realtime_event_source) { + r = sd_event_source_set_time(t->realtime_event_source, t->next_elapse_realtime); + if (r < 0) + goto fail; + + r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_ONESHOT); + if (r < 0) + goto fail; + } else { + r = sd_event_add_time( + UNIT(t)->manager->event, + &t->realtime_event_source, + t->wake_system ? CLOCK_REALTIME_ALARM : CLOCK_REALTIME, + t->next_elapse_realtime, t->accuracy_usec, + timer_dispatch, t); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(t->realtime_event_source, "timer-realtime"); + } + + } else if (t->realtime_event_source) { + + r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_OFF); + if (r < 0) + goto fail; + } + + timer_set_state(t, TIMER_WAITING); + return; + +fail: + log_unit_warning_errno(UNIT(t), r, "Failed to enter waiting state: %m"); + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); +} + +static void timer_enter_running(Timer *t) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *trigger; + int r; + + assert(t); + + /* Don't start job if we are supposed to go down */ + if (unit_stop_pending(UNIT(t))) + return; + + trigger = UNIT_TRIGGER(UNIT(t)); + if (!trigger) { + log_unit_error(UNIT(t), "Unit to trigger vanished."); + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); + return; + } + + r = manager_add_job(UNIT(t)->manager, JOB_START, trigger, JOB_REPLACE, &error, NULL); + if (r < 0) + goto fail; + + dual_timestamp_get(&t->last_trigger); + + if (t->stamp_path) + touch_file(t->stamp_path, true, t->last_trigger.realtime, UID_INVALID, GID_INVALID, MODE_INVALID); + + timer_set_state(t, TIMER_RUNNING); + return; + +fail: + log_unit_warning(UNIT(t), "Failed to queue unit startup job: %s", bus_error_message(&error, r)); + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); +} + +static int timer_start(Unit *u) { + Timer *t = TIMER(u); + TimerValue *v; + Unit *trigger; + int r; + + assert(t); + assert(IN_SET(t->state, TIMER_DEAD, TIMER_FAILED)); + + trigger = UNIT_TRIGGER(u); + if (!trigger || trigger->load_state != UNIT_LOADED) { + log_unit_error(u, "Refusing to start, unit to trigger not loaded."); + return -ENOENT; + } + + r = unit_start_limit_test(u); + if (r < 0) { + timer_enter_dead(t, TIMER_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + t->last_trigger = DUAL_TIMESTAMP_NULL; + + /* Reenable all timers that depend on unit activation time */ + LIST_FOREACH(value, v, t->values) + if (v->base == TIMER_ACTIVE) + v->disabled = false; + + if (t->stamp_path) { + struct stat st; + + if (stat(t->stamp_path, &st) >= 0) { + usec_t ft; + + /* Load the file timestamp, but only if it is actually in the past. If it is in the future, + * something is wrong with the system clock. */ + + ft = timespec_load(&st.st_mtim); + if (ft < now(CLOCK_REALTIME)) + t->last_trigger.realtime = ft; + else { + char z[FORMAT_TIMESTAMP_MAX]; + + log_unit_warning(u, "Not using persistent file timestamp %s as it is in the future.", + format_timestamp(z, sizeof(z), ft)); + } + + } else if (errno == ENOENT) + /* The timer has never run before, + * make sure a stamp file exists. + */ + (void) touch_file(t->stamp_path, true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID); + } + + t->result = TIMER_SUCCESS; + timer_enter_waiting(t, false); + return 1; +} + +static int timer_stop(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + assert(IN_SET(t->state, TIMER_WAITING, TIMER_RUNNING, TIMER_ELAPSED)); + + timer_enter_dead(t, TIMER_SUCCESS); + return 1; +} + +static int timer_serialize(Unit *u, FILE *f, FDSet *fds) { + Timer *t = TIMER(u); + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", timer_state_to_string(t->state)); + (void) serialize_item(f, "result", timer_result_to_string(t->result)); + + if (t->last_trigger.realtime > 0) + (void) serialize_usec(f, "last-trigger-realtime", t->last_trigger.realtime); + + if (t->last_trigger.monotonic > 0) + (void) serialize_usec(f, "last-trigger-monotonic", t->last_trigger.monotonic); + + return 0; +} + +static int timer_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Timer *t = TIMER(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + TimerState state; + + state = timer_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + t->deserialized_state = state; + + } else if (streq(key, "result")) { + TimerResult f; + + f = timer_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != TIMER_SUCCESS) + t->result = f; + + } else if (streq(key, "last-trigger-realtime")) + (void) deserialize_usec(value, &t->last_trigger.realtime); + else if (streq(key, "last-trigger-monotonic")) + (void) deserialize_usec(value, &t->last_trigger.monotonic); + else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +_pure_ static UnitActiveState timer_active_state(Unit *u) { + assert(u); + + return state_translation_table[TIMER(u)->state]; +} + +_pure_ static const char *timer_sub_state_to_string(Unit *u) { + assert(u); + + return timer_state_to_string(TIMER(u)->state); +} + +static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata) { + Timer *t = TIMER(userdata); + + assert(t); + + if (t->state != TIMER_WAITING) + return 0; + + log_unit_debug(UNIT(t), "Timer elapsed."); + timer_enter_running(t); + return 0; +} + +static void timer_trigger_notify(Unit *u, Unit *other) { + Timer *t = TIMER(u); + TimerValue *v; + + assert(u); + assert(other); + + if (other->load_state != UNIT_LOADED) + return; + + /* Reenable all timers that depend on unit state */ + LIST_FOREACH(value, v, t->values) + if (IN_SET(v->base, TIMER_UNIT_ACTIVE, TIMER_UNIT_INACTIVE)) + v->disabled = false; + + switch (t->state) { + + case TIMER_WAITING: + case TIMER_ELAPSED: + + /* Recalculate sleep time */ + timer_enter_waiting(t, false); + break; + + case TIMER_RUNNING: + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) { + log_unit_debug(UNIT(t), "Got notified about unit deactivation."); + timer_enter_waiting(t, false); + } + break; + + case TIMER_DEAD: + case TIMER_FAILED: + break; + + default: + assert_not_reached("Unknown timer state"); + } +} + +static void timer_reset_failed(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + + if (t->state == TIMER_FAILED) + timer_set_state(t, TIMER_DEAD); + + t->result = TIMER_SUCCESS; +} + +static void timer_time_change(Unit *u) { + Timer *t = TIMER(u); + usec_t ts; + + assert(u); + + if (t->state != TIMER_WAITING) + return; + + /* If we appear to have triggered in the future, the system clock must + * have been set backwards. So let's rewind our own clock and allow + * the future trigger(s) to happen again :). Exactly the same as when + * you start a timer unit with Persistent=yes. */ + ts = now(CLOCK_REALTIME); + if (t->last_trigger.realtime > ts) + t->last_trigger.realtime = ts; + + log_unit_debug(u, "Time change, recalculating next elapse."); + timer_enter_waiting(t, true); +} + +static void timer_timezone_change(Unit *u) { + Timer *t = TIMER(u); + + assert(u); + + if (t->state != TIMER_WAITING) + return; + + log_unit_debug(u, "Timezone change, recalculating next elapse."); + timer_enter_waiting(t, false); +} + +static const char* const timer_base_table[_TIMER_BASE_MAX] = { + [TIMER_ACTIVE] = "OnActiveSec", + [TIMER_BOOT] = "OnBootSec", + [TIMER_STARTUP] = "OnStartupSec", + [TIMER_UNIT_ACTIVE] = "OnUnitActiveSec", + [TIMER_UNIT_INACTIVE] = "OnUnitInactiveSec", + [TIMER_CALENDAR] = "OnCalendar" +}; + +DEFINE_STRING_TABLE_LOOKUP(timer_base, TimerBase); + +static const char* const timer_result_table[_TIMER_RESULT_MAX] = { + [TIMER_SUCCESS] = "success", + [TIMER_FAILURE_RESOURCES] = "resources", + [TIMER_FAILURE_START_LIMIT_HIT] = "start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(timer_result, TimerResult); + +const UnitVTable timer_vtable = { + .object_size = sizeof(Timer), + + .sections = + "Unit\0" + "Timer\0" + "Install\0", + .private_section = "Timer", + + .init = timer_init, + .done = timer_done, + .load = timer_load, + + .coldplug = timer_coldplug, + + .dump = timer_dump, + + .start = timer_start, + .stop = timer_stop, + + .serialize = timer_serialize, + .deserialize_item = timer_deserialize_item, + + .active_state = timer_active_state, + .sub_state_to_string = timer_sub_state_to_string, + + .trigger_notify = timer_trigger_notify, + + .reset_failed = timer_reset_failed, + .time_change = timer_time_change, + .timezone_change = timer_timezone_change, + + .bus_vtable = bus_timer_vtable, + .bus_set_property = bus_timer_set_property, + + .can_transient = true, +}; diff --git a/src/core/timer.h b/src/core/timer.h new file mode 100644 index 0000000..833aadb --- /dev/null +++ b/src/core/timer.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct Timer Timer; + +#include "calendarspec.h" +#include "unit.h" + +typedef enum TimerBase { + TIMER_ACTIVE, + TIMER_BOOT, + TIMER_STARTUP, + TIMER_UNIT_ACTIVE, + TIMER_UNIT_INACTIVE, + TIMER_CALENDAR, + _TIMER_BASE_MAX, + _TIMER_BASE_INVALID = -1 +} TimerBase; + +typedef struct TimerValue { + TimerBase base; + bool disabled; + + usec_t value; /* only for monotonic events */ + CalendarSpec *calendar_spec; /* only for calendar events */ + usec_t next_elapse; + + LIST_FIELDS(struct TimerValue, value); +} TimerValue; + +typedef enum TimerResult { + TIMER_SUCCESS, + TIMER_FAILURE_RESOURCES, + TIMER_FAILURE_START_LIMIT_HIT, + _TIMER_RESULT_MAX, + _TIMER_RESULT_INVALID = -1 +} TimerResult; + +struct Timer { + Unit meta; + + usec_t accuracy_usec; + usec_t random_usec; + + LIST_HEAD(TimerValue, values); + usec_t next_elapse_realtime; + usec_t next_elapse_monotonic_or_boottime; + dual_timestamp last_trigger; + + TimerState state, deserialized_state; + + sd_event_source *monotonic_event_source; + sd_event_source *realtime_event_source; + + TimerResult result; + + bool persistent; + bool wake_system; + bool remain_after_elapse; + + char *stamp_path; +}; + +#define TIMER_MONOTONIC_CLOCK(t) ((t)->wake_system && clock_boottime_supported() ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC) + +void timer_free_values(Timer *t); + +extern const UnitVTable timer_vtable; + +const char *timer_base_to_string(TimerBase i) _const_; +TimerBase timer_base_from_string(const char *s) _pure_; + +const char* timer_result_to_string(TimerResult i) _const_; +TimerResult timer_result_from_string(const char *s) _pure_; + +DEFINE_CAST(TIMER, Timer); diff --git a/src/core/transaction.c b/src/core/transaction.c new file mode 100644 index 0000000..2418332 --- /dev/null +++ b/src/core/transaction.c @@ -0,0 +1,1119 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <fcntl.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "terminal-util.h" +#include "transaction.h" +#include "dbus-unit.h" + +static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies); + +static void transaction_delete_job(Transaction *tr, Job *j, bool delete_dependencies) { + assert(tr); + assert(j); + + /* Deletes one job from the transaction */ + + transaction_unlink_job(tr, j, delete_dependencies); + + job_free(j); +} + +static void transaction_delete_unit(Transaction *tr, Unit *u) { + Job *j; + + /* Deletes all jobs associated with a certain unit from the + * transaction */ + + while ((j = hashmap_get(tr->jobs, u))) + transaction_delete_job(tr, j, true); +} + +void transaction_abort(Transaction *tr) { + Job *j; + + assert(tr); + + while ((j = hashmap_first(tr->jobs))) + transaction_delete_job(tr, j, false); + + assert(hashmap_isempty(tr->jobs)); +} + +static void transaction_find_jobs_that_matter_to_anchor(Job *j, unsigned generation) { + JobDependency *l; + + /* A recursive sweep through the graph that marks all units + * that matter to the anchor job, i.e. are directly or + * indirectly a dependency of the anchor job via paths that + * are fully marked as mattering. */ + + j->matters_to_anchor = true; + j->generation = generation; + + LIST_FOREACH(subject, l, j->subject_list) { + + /* This link does not matter */ + if (!l->matters) + continue; + + /* This unit has already been marked */ + if (l->object->generation == generation) + continue; + + transaction_find_jobs_that_matter_to_anchor(l->object, generation); + } +} + +static void transaction_merge_and_delete_job(Transaction *tr, Job *j, Job *other, JobType t) { + JobDependency *l, *last; + + assert(j); + assert(other); + assert(j->unit == other->unit); + assert(!j->installed); + + /* Merges 'other' into 'j' and then deletes 'other'. */ + + j->type = t; + j->state = JOB_WAITING; + j->irreversible = j->irreversible || other->irreversible; + j->matters_to_anchor = j->matters_to_anchor || other->matters_to_anchor; + + /* Patch us in as new owner of the JobDependency objects */ + last = NULL; + LIST_FOREACH(subject, l, other->subject_list) { + assert(l->subject == other); + l->subject = j; + last = l; + } + + /* Merge both lists */ + if (last) { + last->subject_next = j->subject_list; + if (j->subject_list) + j->subject_list->subject_prev = last; + j->subject_list = other->subject_list; + } + + /* Patch us in as new owner of the JobDependency objects */ + last = NULL; + LIST_FOREACH(object, l, other->object_list) { + assert(l->object == other); + l->object = j; + last = l; + } + + /* Merge both lists */ + if (last) { + last->object_next = j->object_list; + if (j->object_list) + j->object_list->object_prev = last; + j->object_list = other->object_list; + } + + /* Kill the other job */ + other->subject_list = NULL; + other->object_list = NULL; + transaction_delete_job(tr, other, true); +} + +_pure_ static bool job_is_conflicted_by(Job *j) { + JobDependency *l; + + assert(j); + + /* Returns true if this job is pulled in by a least one + * ConflictedBy dependency. */ + + LIST_FOREACH(object, l, j->object_list) + if (l->conflicts) + return true; + + return false; +} + +static int delete_one_unmergeable_job(Transaction *tr, Job *j) { + Job *k; + + assert(j); + + /* Tries to delete one item in the linked list + * j->transaction_next->transaction_next->... that conflicts + * with another one, in an attempt to make an inconsistent + * transaction work. */ + + /* We rely here on the fact that if a merged with b does not + * merge with c, either a or b merge with c neither */ + LIST_FOREACH(transaction, j, j) + LIST_FOREACH(transaction, k, j->transaction_next) { + Job *d; + + /* Is this one mergeable? Then skip it */ + if (job_type_is_mergeable(j->type, k->type)) + continue; + + /* Ok, we found two that conflict, let's see if we can + * drop one of them */ + if (!j->matters_to_anchor && !k->matters_to_anchor) { + + /* Both jobs don't matter, so let's + * find the one that is smarter to + * remove. Let's think positive and + * rather remove stops then starts -- + * except if something is being + * stopped because it is conflicted by + * another unit in which case we + * rather remove the start. */ + + log_unit_debug(j->unit, + "Looking at job %s/%s conflicted_by=%s", + j->unit->id, job_type_to_string(j->type), + yes_no(j->type == JOB_STOP && job_is_conflicted_by(j))); + log_unit_debug(k->unit, + "Looking at job %s/%s conflicted_by=%s", + k->unit->id, job_type_to_string(k->type), + yes_no(k->type == JOB_STOP && job_is_conflicted_by(k))); + + if (j->type == JOB_STOP) { + + if (job_is_conflicted_by(j)) + d = k; + else + d = j; + + } else if (k->type == JOB_STOP) { + + if (job_is_conflicted_by(k)) + d = j; + else + d = k; + } else + d = j; + + } else if (!j->matters_to_anchor) + d = j; + else if (!k->matters_to_anchor) + d = k; + else + return -ENOEXEC; + + /* Ok, we can drop one, so let's do so. */ + log_unit_debug(d->unit, + "Fixing conflicting jobs %s/%s,%s/%s by deleting job %s/%s", + j->unit->id, job_type_to_string(j->type), + k->unit->id, job_type_to_string(k->type), + d->unit->id, job_type_to_string(d->type)); + transaction_delete_job(tr, d, true); + return 0; + } + + return -EINVAL; +} + +static int transaction_merge_jobs(Transaction *tr, sd_bus_error *e) { + Job *j; + Iterator i; + int r; + + assert(tr); + + /* First step, check whether any of the jobs for one specific + * task conflict. If so, try to drop one of them. */ + HASHMAP_FOREACH(j, tr->jobs, i) { + JobType t; + Job *k; + + t = j->type; + LIST_FOREACH(transaction, k, j->transaction_next) { + if (job_type_merge_and_collapse(&t, k->type, j->unit) >= 0) + continue; + + /* OK, we could not merge all jobs for this + * action. Let's see if we can get rid of one + * of them */ + + r = delete_one_unmergeable_job(tr, j); + if (r >= 0) + /* Ok, we managed to drop one, now + * let's ask our callers to call us + * again after garbage collecting */ + return -EAGAIN; + + /* We couldn't merge anything. Failure */ + return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_JOBS_CONFLICTING, + "Transaction contains conflicting jobs '%s' and '%s' for %s. " + "Probably contradicting requirement dependencies configured.", + job_type_to_string(t), + job_type_to_string(k->type), + k->unit->id); + } + } + + /* Second step, merge the jobs. */ + HASHMAP_FOREACH(j, tr->jobs, i) { + JobType t = j->type; + Job *k; + + /* Merge all transaction jobs for j->unit */ + LIST_FOREACH(transaction, k, j->transaction_next) + assert_se(job_type_merge_and_collapse(&t, k->type, j->unit) == 0); + + while ((k = j->transaction_next)) { + if (tr->anchor_job == k) { + transaction_merge_and_delete_job(tr, k, j, t); + j = k; + } else + transaction_merge_and_delete_job(tr, j, k, t); + } + + assert(!j->transaction_next); + assert(!j->transaction_prev); + } + + return 0; +} + +static void transaction_drop_redundant(Transaction *tr) { + Job *j; + Iterator i; + + /* Goes through the transaction and removes all jobs of the units + * whose jobs are all noops. If not all of a unit's jobs are + * redundant, they are kept. */ + + assert(tr); + +rescan: + HASHMAP_FOREACH(j, tr->jobs, i) { + Job *k; + + LIST_FOREACH(transaction, k, j) { + + if (tr->anchor_job == k || + !job_type_is_redundant(k->type, unit_active_state(k->unit)) || + (k->unit->job && job_type_is_conflicting(k->type, k->unit->job->type))) + goto next_unit; + } + + log_trace("Found redundant job %s/%s, dropping from transaction.", j->unit->id, job_type_to_string(j->type)); + transaction_delete_job(tr, j, false); + goto rescan; + next_unit:; + } +} + +_pure_ static bool unit_matters_to_anchor(Unit *u, Job *j) { + assert(u); + assert(!j->transaction_prev); + + /* Checks whether at least one of the jobs for this unit + * matters to the anchor. */ + + LIST_FOREACH(transaction, j, j) + if (j->matters_to_anchor) + return true; + + return false; +} + +static char* merge_unit_ids(const char* unit_log_field, char **pairs) { + char **unit_id, **job_type, *ans = NULL; + size_t alloc = 0, size = 0, next; + + STRV_FOREACH_PAIR(unit_id, job_type, pairs) { + next = strlen(unit_log_field) + strlen(*unit_id); + if (!GREEDY_REALLOC(ans, alloc, size + next + 1)) { + return mfree(ans); + } + + sprintf(ans + size, "%s%s", unit_log_field, *unit_id); + if (*(unit_id+1)) + ans[size + next] = '\n'; + size += next + 1; + } + + return ans; +} + +static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsigned generation, sd_bus_error *e) { + Iterator i; + Unit *u; + void *v; + int r; + + assert(tr); + assert(j); + assert(!j->transaction_prev); + + /* Does a recursive sweep through the ordering graph, looking + * for a cycle. If we find a cycle we try to break it. */ + + /* Have we seen this before? */ + if (j->generation == generation) { + Job *k, *delete = NULL; + _cleanup_free_ char **array = NULL, *unit_ids = NULL; + char **unit_id, **job_type; + + /* If the marker is NULL we have been here already and + * decided the job was loop-free from here. Hence + * shortcut things and return right-away. */ + if (!j->marker) + return 0; + + /* So, the marker is not NULL and we already have been here. We have + * a cycle. Let's try to break it. We go backwards in our path and + * try to find a suitable job to remove. We use the marker to find + * our way back, since smart how we are we stored our way back in + * there. */ + + for (k = from; k; k = ((k->generation == generation && k->marker != k) ? k->marker : NULL)) { + + /* For logging below */ + if (strv_push_pair(&array, k->unit->id, (char*) job_type_to_string(k->type)) < 0) + log_oom(); + + if (!delete && hashmap_get(tr->jobs, k->unit) && !unit_matters_to_anchor(k->unit, k)) + /* Ok, we can drop this one, so let's do so. */ + delete = k; + + /* Check if this in fact was the beginning of the cycle */ + if (k == j) + break; + } + + unit_ids = merge_unit_ids(j->manager->unit_log_field, array); /* ignore error */ + + STRV_FOREACH_PAIR(unit_id, job_type, array) + /* logging for j not k here to provide a consistent narrative */ + log_struct(LOG_WARNING, + "MESSAGE=%s: Found %s on %s/%s", + j->unit->id, + unit_id == array ? "ordering cycle" : "dependency", + *unit_id, *job_type, + unit_ids); + + if (delete) { + const char *status; + /* logging for j not k here to provide a consistent narrative */ + log_struct(LOG_ERR, + "MESSAGE=%s: Job %s/%s deleted to break ordering cycle starting with %s/%s", + j->unit->id, delete->unit->id, job_type_to_string(delete->type), + j->unit->id, job_type_to_string(j->type), + unit_ids); + + if (log_get_show_color()) + status = ANSI_HIGHLIGHT_RED " SKIP " ANSI_NORMAL; + else + status = " SKIP "; + + unit_status_printf(delete->unit, status, + "Ordering cycle found, skipping %s"); + transaction_delete_unit(tr, delete->unit); + return -EAGAIN; + } + + log_struct(LOG_ERR, + "MESSAGE=%s: Unable to break cycle starting with %s/%s", + j->unit->id, j->unit->id, job_type_to_string(j->type), + unit_ids); + + return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_ORDER_IS_CYCLIC, + "Transaction order is cyclic. See system logs for details."); + } + + /* Make the marker point to where we come from, so that we can + * find our way backwards if we want to break a cycle. We use + * a special marker for the beginning: we point to + * ourselves. */ + j->marker = from ? from : j; + j->generation = generation; + + /* We assume that the dependencies are bidirectional, and + * hence can ignore UNIT_AFTER */ + HASHMAP_FOREACH_KEY(v, u, j->unit->dependencies[UNIT_BEFORE], i) { + Job *o; + + /* Is there a job for this unit? */ + o = hashmap_get(tr->jobs, u); + if (!o) { + /* Ok, there is no job for this in the + * transaction, but maybe there is already one + * running? */ + o = u->job; + if (!o) + continue; + } + + r = transaction_verify_order_one(tr, o, j, generation, e); + if (r < 0) + return r; + } + + /* Ok, let's backtrack, and remember that this entry is not on + * our path anymore. */ + j->marker = NULL; + + return 0; +} + +static int transaction_verify_order(Transaction *tr, unsigned *generation, sd_bus_error *e) { + Job *j; + int r; + Iterator i; + unsigned g; + + assert(tr); + assert(generation); + + /* Check if the ordering graph is cyclic. If it is, try to fix + * that up by dropping one of the jobs. */ + + g = (*generation)++; + + HASHMAP_FOREACH(j, tr->jobs, i) { + r = transaction_verify_order_one(tr, j, NULL, g, e); + if (r < 0) + return r; + } + + return 0; +} + +static void transaction_collect_garbage(Transaction *tr) { + Iterator i; + Job *j; + + assert(tr); + + /* Drop jobs that are not required by any other job */ + +rescan: + HASHMAP_FOREACH(j, tr->jobs, i) { + if (tr->anchor_job == j) + continue; + if (j->object_list) { + log_trace("Keeping job %s/%s because of %s/%s", + j->unit->id, job_type_to_string(j->type), + j->object_list->subject ? j->object_list->subject->unit->id : "root", + j->object_list->subject ? job_type_to_string(j->object_list->subject->type) : "root"); + continue; + } + + log_trace("Garbage collecting job %s/%s", j->unit->id, job_type_to_string(j->type)); + transaction_delete_job(tr, j, true); + goto rescan; + } +} + +static int transaction_is_destructive(Transaction *tr, JobMode mode, sd_bus_error *e) { + Iterator i; + Job *j; + + assert(tr); + + /* Checks whether applying this transaction means that + * existing jobs would be replaced */ + + HASHMAP_FOREACH(j, tr->jobs, i) { + + /* Assume merged */ + assert(!j->transaction_prev); + assert(!j->transaction_next); + + if (j->unit->job && (mode == JOB_FAIL || j->unit->job->irreversible) && + job_type_is_conflicting(j->unit->job->type, j->type)) + return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_IS_DESTRUCTIVE, + "Transaction for %s/%s is destructive (%s has '%s' job queued, but '%s' is included in transaction).", + tr->anchor_job->unit->id, job_type_to_string(tr->anchor_job->type), + j->unit->id, job_type_to_string(j->unit->job->type), job_type_to_string(j->type)); + } + + return 0; +} + +static void transaction_minimize_impact(Transaction *tr) { + Job *j; + Iterator i; + + assert(tr); + + /* Drops all unnecessary jobs that reverse already active jobs + * or that stop a running service. */ + +rescan: + HASHMAP_FOREACH(j, tr->jobs, i) { + LIST_FOREACH(transaction, j, j) { + bool stops_running_service, changes_existing_job; + + /* If it matters, we shouldn't drop it */ + if (j->matters_to_anchor) + continue; + + /* Would this stop a running service? + * Would this change an existing job? + * If so, let's drop this entry */ + + stops_running_service = + j->type == JOB_STOP && UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(j->unit)); + + changes_existing_job = + j->unit->job && + job_type_is_conflicting(j->type, j->unit->job->type); + + if (!stops_running_service && !changes_existing_job) + continue; + + if (stops_running_service) + log_unit_debug(j->unit, + "%s/%s would stop a running service.", + j->unit->id, job_type_to_string(j->type)); + + if (changes_existing_job) + log_unit_debug(j->unit, + "%s/%s would change existing job.", + j->unit->id, job_type_to_string(j->type)); + + /* Ok, let's get rid of this */ + log_unit_debug(j->unit, + "Deleting %s/%s to minimize impact.", + j->unit->id, job_type_to_string(j->type)); + + transaction_delete_job(tr, j, true); + goto rescan; + } + } +} + +static int transaction_apply(Transaction *tr, Manager *m, JobMode mode) { + Iterator i; + Job *j; + int r; + + /* Moves the transaction jobs to the set of active jobs */ + + if (IN_SET(mode, JOB_ISOLATE, JOB_FLUSH)) { + + /* When isolating first kill all installed jobs which + * aren't part of the new transaction */ + HASHMAP_FOREACH(j, m->jobs, i) { + assert(j->installed); + + if (j->unit->ignore_on_isolate) + continue; + + if (hashmap_get(tr->jobs, j->unit)) + continue; + + /* Not invalidating recursively. Avoids triggering + * OnFailure= actions of dependent jobs. Also avoids + * invalidating our iterator. */ + job_finish_and_invalidate(j, JOB_CANCELED, false, false); + } + } + + HASHMAP_FOREACH(j, tr->jobs, i) { + /* Assume merged */ + assert(!j->transaction_prev); + assert(!j->transaction_next); + + r = hashmap_put(m->jobs, UINT32_TO_PTR(j->id), j); + if (r < 0) + goto rollback; + } + + while ((j = hashmap_steal_first(tr->jobs))) { + Job *installed_job; + + /* Clean the job dependencies */ + transaction_unlink_job(tr, j, false); + + installed_job = job_install(j); + if (installed_job != j) { + /* j has been merged into a previously installed job */ + if (tr->anchor_job == j) + tr->anchor_job = installed_job; + hashmap_remove(m->jobs, UINT32_TO_PTR(j->id)); + job_free(j); + j = installed_job; + } + + job_add_to_run_queue(j); + job_add_to_dbus_queue(j); + job_start_timer(j, false); + job_shutdown_magic(j); + } + + return 0; + +rollback: + + HASHMAP_FOREACH(j, tr->jobs, i) + hashmap_remove(m->jobs, UINT32_TO_PTR(j->id)); + + return r; +} + +int transaction_activate(Transaction *tr, Manager *m, JobMode mode, sd_bus_error *e) { + Iterator i; + Job *j; + int r; + unsigned generation = 1; + + assert(tr); + + /* This applies the changes recorded in tr->jobs to + * the actual list of jobs, if possible. */ + + /* Reset the generation counter of all installed jobs. The detection of cycles + * looks at installed jobs. If they had a non-zero generation from some previous + * walk of the graph, the algorithm would break. */ + HASHMAP_FOREACH(j, m->jobs, i) + j->generation = 0; + + /* First step: figure out which jobs matter */ + transaction_find_jobs_that_matter_to_anchor(tr->anchor_job, generation++); + + /* Second step: Try not to stop any running services if + * we don't have to. Don't try to reverse running + * jobs if we don't have to. */ + if (mode == JOB_FAIL) + transaction_minimize_impact(tr); + + /* Third step: Drop redundant jobs */ + transaction_drop_redundant(tr); + + for (;;) { + /* Fourth step: Let's remove unneeded jobs that might + * be lurking. */ + if (mode != JOB_ISOLATE) + transaction_collect_garbage(tr); + + /* Fifth step: verify order makes sense and correct + * cycles if necessary and possible */ + r = transaction_verify_order(tr, &generation, e); + if (r >= 0) + break; + + if (r != -EAGAIN) + return log_warning_errno(r, "Requested transaction contains an unfixable cyclic ordering dependency: %s", bus_error_message(e, r)); + + /* Let's see if the resulting transaction ordering + * graph is still cyclic... */ + } + + for (;;) { + /* Sixth step: let's drop unmergeable entries if + * necessary and possible, merge entries we can + * merge */ + r = transaction_merge_jobs(tr, e); + if (r >= 0) + break; + + if (r != -EAGAIN) + return log_warning_errno(r, "Requested transaction contains unmergeable jobs: %s", bus_error_message(e, r)); + + /* Seventh step: an entry got dropped, let's garbage + * collect its dependencies. */ + if (mode != JOB_ISOLATE) + transaction_collect_garbage(tr); + + /* Let's see if the resulting transaction still has + * unmergeable entries ... */ + } + + /* Eights step: Drop redundant jobs again, if the merging now allows us to drop more. */ + transaction_drop_redundant(tr); + + /* Ninth step: check whether we can actually apply this */ + r = transaction_is_destructive(tr, mode, e); + if (r < 0) + return log_notice_errno(r, "Requested transaction contradicts existing jobs: %s", bus_error_message(e, r)); + + /* Tenth step: apply changes */ + r = transaction_apply(tr, m, mode); + if (r < 0) + return log_warning_errno(r, "Failed to apply transaction: %m"); + + assert(hashmap_isempty(tr->jobs)); + + if (!hashmap_isempty(m->jobs)) { + /* Are there any jobs now? Then make sure we have the + * idle pipe around. We don't really care too much + * whether this works or not, as the idle pipe is a + * feature for cosmetics, not actually useful for + * anything beyond that. */ + + if (m->idle_pipe[0] < 0 && m->idle_pipe[1] < 0 && + m->idle_pipe[2] < 0 && m->idle_pipe[3] < 0) { + (void) pipe2(m->idle_pipe, O_NONBLOCK|O_CLOEXEC); + (void) pipe2(m->idle_pipe + 2, O_NONBLOCK|O_CLOEXEC); + } + } + + return 0; +} + +static Job* transaction_add_one_job(Transaction *tr, JobType type, Unit *unit, bool *is_new) { + Job *j, *f; + + assert(tr); + assert(unit); + + /* Looks for an existing prospective job and returns that. If + * it doesn't exist it is created and added to the prospective + * jobs list. */ + + f = hashmap_get(tr->jobs, unit); + + LIST_FOREACH(transaction, j, f) { + assert(j->unit == unit); + + if (j->type == type) { + if (is_new) + *is_new = false; + return j; + } + } + + j = job_new(unit, type); + if (!j) + return NULL; + + j->generation = 0; + j->marker = NULL; + j->matters_to_anchor = false; + j->irreversible = tr->irreversible; + + LIST_PREPEND(transaction, f, j); + + if (hashmap_replace(tr->jobs, unit, f) < 0) { + LIST_REMOVE(transaction, f, j); + job_free(j); + return NULL; + } + + if (is_new) + *is_new = true; + + log_trace("Added job %s/%s to transaction.", unit->id, job_type_to_string(type)); + + return j; +} + +static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies) { + assert(tr); + assert(j); + + if (j->transaction_prev) + j->transaction_prev->transaction_next = j->transaction_next; + else if (j->transaction_next) + hashmap_replace(tr->jobs, j->unit, j->transaction_next); + else + hashmap_remove_value(tr->jobs, j->unit, j); + + if (j->transaction_next) + j->transaction_next->transaction_prev = j->transaction_prev; + + j->transaction_prev = j->transaction_next = NULL; + + while (j->subject_list) + job_dependency_free(j->subject_list); + + while (j->object_list) { + Job *other = j->object_list->matters ? j->object_list->subject : NULL; + + job_dependency_free(j->object_list); + + if (other && delete_dependencies) { + log_unit_debug(other->unit, + "Deleting job %s/%s as dependency of job %s/%s", + other->unit->id, job_type_to_string(other->type), + j->unit->id, job_type_to_string(j->type)); + transaction_delete_job(tr, other, delete_dependencies); + } + } +} + +void transaction_add_propagate_reload_jobs(Transaction *tr, Unit *unit, Job *by, bool ignore_order, sd_bus_error *e) { + Iterator i; + JobType nt; + Unit *dep; + void *v; + int r; + + assert(tr); + assert(unit); + + HASHMAP_FOREACH_KEY(v, dep, unit->dependencies[UNIT_PROPAGATES_RELOAD_TO], i) { + nt = job_type_collapse(JOB_TRY_RELOAD, dep); + if (nt == JOB_NOP) + continue; + + r = transaction_add_job_and_dependencies(tr, nt, dep, by, false, false, false, ignore_order, e); + if (r < 0) { + log_unit_warning(dep, + "Cannot add dependency reload job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } +} + +int transaction_add_job_and_dependencies( + Transaction *tr, + JobType type, + Unit *unit, + Job *by, + bool matters, + bool conflicts, + bool ignore_requirements, + bool ignore_order, + sd_bus_error *e) { + + bool is_new; + Iterator i; + Unit *dep; + Job *ret; + void *v; + int r; + + assert(tr); + assert(type < _JOB_TYPE_MAX); + assert(type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(unit); + + /* Before adding jobs for this unit, let's ensure that its state has been loaded + * This matters when jobs are spawned as part of coldplugging itself (see e. g. path_coldplug()). + * This way, we "recursively" coldplug units, ensuring that we do not look at state of + * not-yet-coldplugged units. */ + if (MANAGER_IS_RELOADING(unit->manager)) + unit_coldplug(unit); + + if (by) + log_trace("Pulling in %s/%s from %s/%s", unit->id, job_type_to_string(type), by->unit->id, job_type_to_string(by->type)); + + /* Safety check that the unit is a valid state, i.e. not in UNIT_STUB or UNIT_MERGED which should only be set + * temporarily. */ + if (!IN_SET(unit->load_state, UNIT_LOADED, UNIT_ERROR, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_MASKED)) + return sd_bus_error_setf(e, BUS_ERROR_LOAD_FAILED, "Unit %s is not loaded properly.", unit->id); + + if (type != JOB_STOP) { + r = bus_unit_validate_load_state(unit, e); + if (r < 0) + return r; + } + + if (!unit_job_is_applicable(unit, type)) + return sd_bus_error_setf(e, BUS_ERROR_JOB_TYPE_NOT_APPLICABLE, + "Job type %s is not applicable for unit %s.", + job_type_to_string(type), unit->id); + + /* First add the job. */ + ret = transaction_add_one_job(tr, type, unit, &is_new); + if (!ret) + return -ENOMEM; + + ret->ignore_order = ret->ignore_order || ignore_order; + + /* Then, add a link to the job. */ + if (by) { + if (!job_dependency_new(by, ret, matters, conflicts)) + return -ENOMEM; + } else { + /* If the job has no parent job, it is the anchor job. */ + assert(!tr->anchor_job); + tr->anchor_job = ret; + } + + if (is_new && !ignore_requirements && type != JOB_NOP) { + Set *following; + + /* If we are following some other unit, make sure we + * add all dependencies of everybody following. */ + if (unit_following_set(ret->unit, &following) > 0) { + SET_FOREACH(dep, following, i) { + r = transaction_add_job_and_dependencies(tr, type, dep, ret, false, false, false, ignore_order, e); + if (r < 0) { + log_unit_full(dep, + r == -ERFKILL ? LOG_INFO : LOG_WARNING, + r, "Cannot add dependency job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } + + set_free(following); + } + + /* Finally, recursively add in all dependencies. */ + if (IN_SET(type, JOB_START, JOB_RESTART)) { + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_REQUIRES], i) { + r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, true, false, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_BINDS_TO], i) { + r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, true, false, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_WANTS], i) { + r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, false, false, false, ignore_order, e); + if (r < 0) { + /* unit masked, job type not applicable and unit not found are not considered as errors. */ + log_unit_full(dep, + IN_SET(r, -ERFKILL, -EBADR, -ENOENT) ? LOG_DEBUG : LOG_WARNING, + r, "Cannot add dependency job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_REQUISITE], i) { + r = transaction_add_job_and_dependencies(tr, JOB_VERIFY_ACTIVE, dep, ret, true, false, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_CONFLICTS], i) { + r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, true, true, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_CONFLICTED_BY], i) { + r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, false, false, false, ignore_order, e); + if (r < 0) { + log_unit_warning(dep, + "Cannot add dependency job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } + + } + + if (IN_SET(type, JOB_STOP, JOB_RESTART)) { + static const UnitDependency propagate_deps[] = { + UNIT_REQUIRED_BY, + UNIT_REQUISITE_OF, + UNIT_BOUND_BY, + UNIT_CONSISTS_OF, + }; + + JobType ptype; + unsigned j; + + /* We propagate STOP as STOP, but RESTART only + * as TRY_RESTART, in order not to start + * dependencies that are not around. */ + ptype = type == JOB_RESTART ? JOB_TRY_RESTART : type; + + for (j = 0; j < ELEMENTSOF(propagate_deps); j++) + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[propagate_deps[j]], i) { + JobType nt; + + nt = job_type_collapse(ptype, dep); + if (nt == JOB_NOP) + continue; + + r = transaction_add_job_and_dependencies(tr, nt, dep, ret, true, false, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + } + + if (type == JOB_RELOAD) + transaction_add_propagate_reload_jobs(tr, ret->unit, ret, ignore_order, e); + + /* JOB_VERIFY_ACTIVE requires no dependency handling */ + } + + return 0; + +fail: + return r; +} + +int transaction_add_isolate_jobs(Transaction *tr, Manager *m) { + Iterator i; + Unit *u; + char *k; + int r; + + assert(tr); + assert(m); + + HASHMAP_FOREACH_KEY(u, k, m->units, i) { + + /* ignore aliases */ + if (u->id != k) + continue; + + if (u->ignore_on_isolate) + continue; + + /* No need to stop inactive jobs */ + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) && !u->job) + continue; + + /* Is there already something listed for this? */ + if (hashmap_get(tr->jobs, u)) + continue; + + r = transaction_add_job_and_dependencies(tr, JOB_STOP, u, tr->anchor_job, true, false, false, false, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Cannot add isolate job, ignoring: %m"); + } + + return 0; +} + +Transaction *transaction_new(bool irreversible) { + Transaction *tr; + + tr = new0(Transaction, 1); + if (!tr) + return NULL; + + tr->jobs = hashmap_new(NULL); + if (!tr->jobs) + return mfree(tr); + + tr->irreversible = irreversible; + + return tr; +} + +void transaction_free(Transaction *tr) { + assert(hashmap_isempty(tr->jobs)); + hashmap_free(tr->jobs); + free(tr); +} diff --git a/src/core/transaction.h b/src/core/transaction.h new file mode 100644 index 0000000..70d74a4 --- /dev/null +++ b/src/core/transaction.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +typedef struct Transaction Transaction; + +#include "hashmap.h" +#include "job.h" +#include "manager.h" +#include "unit.h" + +struct Transaction { + /* Jobs to be added */ + Hashmap *jobs; /* Unit object => Job object list 1:1 */ + Job *anchor_job; /* the job the user asked for */ + bool irreversible; +}; + +Transaction *transaction_new(bool irreversible); +void transaction_free(Transaction *tr); + +void transaction_add_propagate_reload_jobs(Transaction *tr, Unit *unit, Job *by, bool ignore_order, sd_bus_error *e); +int transaction_add_job_and_dependencies( + Transaction *tr, + JobType type, + Unit *unit, + Job *by, + bool matters, + bool conflicts, + bool ignore_requirements, + bool ignore_order, + sd_bus_error *e); +int transaction_activate(Transaction *tr, Manager *m, JobMode mode, sd_bus_error *e); +int transaction_add_isolate_jobs(Transaction *tr, Manager *m); +void transaction_abort(Transaction *tr); diff --git a/src/core/triggers.systemd.in b/src/core/triggers.systemd.in new file mode 100644 index 0000000..10b1889 --- /dev/null +++ b/src/core/triggers.systemd.in @@ -0,0 +1,143 @@ +# -*- Mode: rpm-spec; indent-tabs-mode: nil -*- */ +# SPDX-License-Identifier: LGPL-2.1+ +# +# This file is part of systemd. +# Copyright © 2018 Neal Gompa + +# The contents of this are an example to be copied into systemd.spec. +# +# Minimum rpm version supported: 4.13.0 + +%transfiletriggerin -P 900900 -p <lua> -- @systemunitdir@ /etc/systemd/system +-- This script will run after any package is initially installed or +-- upgraded. We care about the case where a package is initially +-- installed, because other cases are covered by the *un scriptlets, +-- so sometimes we will reload needlessly. + +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemctl", "daemon-reload")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerun -p <lua> -- @systemunitdir@ /etc/systemd/system +-- On removal, we need to run daemon-reload after any units have been +-- removed. %transfiletriggerpostun would be ideal, but it does not get +-- executed for some reason. +-- On upgrade, we need to run daemon-reload after any new unit files +-- have been installed, but before %postun scripts in packages get +-- executed. %transfiletriggerun gets the right list of files +-- but it is invoked too early (before changes happen). +-- %filetriggerpostun happens at the right time, but it fires for +-- every package. +-- To execute the reload at the right time, we create a state +-- file in %transfiletriggerun and execute the daemon-reload in +-- the first %filetriggerpostun. + +if posix.access("/run/systemd/system") then + posix.mkdir("%{_localstatedir}/lib") + posix.mkdir("%{_localstatedir}/lib/rpm-state") + posix.mkdir("%{_localstatedir}/lib/rpm-state/systemd") + io.open("%{_localstatedir}/lib/rpm-state/systemd/needs-reload", "w") +end + +%filetriggerpostun -P 1000100 -p <lua> -- @systemunitdir@ /etc/systemd/system +if posix.access("%{_localstatedir}/lib/rpm-state/systemd/needs-reload") then + posix.unlink("%{_localstatedir}/lib/rpm-state/systemd/needs-reload") + posix.rmdir("%{_localstatedir}/lib/rpm-state/systemd") + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemctl", "daemon-reload")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -P 100700 -p <lua> -- @sysusersdir@ +-- This script will process files installed in @sysusersdir@ to create +-- specified users automatically. The priority is set such that it +-- will run before the tmpfiles file trigger. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemd-sysusers")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -P 100500 -p <lua> -- @tmpfilesdir@ +-- This script will process files installed in @tmpfilesdir@ to create +-- tmpfiles automatically. The priority is set such that it will run +-- after the sysusers file trigger, but before any other triggers. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemd-tmpfiles", "--create")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @udevhwdbdir@ +-- This script will automatically invoke hwdb update if files have been +-- installed or updated in @udevhwdbdir@. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemd-hwdb", "update")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @catalogdir@ +-- This script will automatically invoke journal catalog update if files +-- have been installed or updated in @catalogdir@. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/journalctl", "--update-catalog")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @udevrulesdir@ +-- This script will automatically update udev with new rules if files +-- have been installed or updated in @udevrulesdir@. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/udevadm", "control", "--reload")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @sysctldir@ +-- This script will automatically apply sysctl rules if files have been +-- installed or updated in @sysctldir@. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("@rootlibexecdir@/systemd-sysctl")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @binfmtdir@ +-- This script will automatically apply binfmt rules if files have been +-- installed or updated in @binfmtdir@. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("@rootlibexecdir@/systemd-binfmt")) + elseif pid > 0 then + posix.wait(pid) + end +end diff --git a/src/core/umount.c b/src/core/umount.c new file mode 100644 index 0000000..7af0195 --- /dev/null +++ b/src/core/umount.c @@ -0,0 +1,699 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include <errno.h> +#include <fcntl.h> +#include <linux/loop.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/swap.h> + +/* This needs to be after sys/mount.h :( */ +#include <libmount.h> + +#include "sd-device.h" + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "def.h" +#include "device-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fstab-util.h" +#include "linux-3.13/dm-ioctl.h" +#include "mount-setup.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "path-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "umount.h" +#include "util.h" +#include "virt.h" + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct libmnt_table*, mnt_free_table); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct libmnt_iter*, mnt_free_iter); + +static void mount_point_free(MountPoint **head, MountPoint *m) { + assert(head); + assert(m); + + LIST_REMOVE(mount_point, *head, m); + + free(m->path); + free(m->remount_options); + free(m); +} + +void mount_points_list_free(MountPoint **head) { + assert(head); + + while (*head) + mount_point_free(head, *head); +} + +int mount_points_list_get(const char *mountinfo, MountPoint **head) { + _cleanup_(mnt_free_tablep) struct libmnt_table *t = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *i = NULL; + int r; + + assert(head); + + t = mnt_new_table(); + i = mnt_new_iter(MNT_ITER_FORWARD); + if (!t || !i) + return log_oom(); + + r = mnt_table_parse_mtab(t, mountinfo); + if (r < 0) + return log_error_errno(r, "Failed to parse %s: %m", mountinfo); + + for (;;) { + struct libmnt_fs *fs; + const char *path, *fstype; + _cleanup_free_ char *options = NULL; + _cleanup_free_ char *p = NULL; + unsigned long remount_flags = 0u; + _cleanup_free_ char *remount_options = NULL; + bool try_remount_ro; + MountPoint *m; + + r = mnt_table_next_fs(t, i, &fs); + if (r == 1) + break; + if (r < 0) + return log_error_errno(r, "Failed to get next entry from %s: %m", mountinfo); + + path = mnt_fs_get_target(fs); + if (!path) + continue; + + if (cunescape(path, UNESCAPE_RELAX, &p) < 0) + return log_oom(); + + fstype = mnt_fs_get_fstype(fs); + + /* Combine the generic VFS options with the FS-specific + * options. Duplicates are not a problem here, because the only + * options that should come up twice are typically ro/rw, which + * are turned into MS_RDONLY or the invertion of it. + * + * Even if there are duplicates later in mount_option_mangle() + * it shouldn't hurt anyways as they override each other. + */ + if (!strextend_with_separator(&options, ",", + mnt_fs_get_vfs_options(fs), + NULL)) + return log_oom(); + if (!strextend_with_separator(&options, ",", + mnt_fs_get_fs_options(fs), + NULL)) + return log_oom(); + + /* Ignore mount points we can't unmount because they + * are API or because we are keeping them open (like + * /dev/console). Also, ignore all mounts below API + * file systems, since they are likely virtual too, + * and hence not worth spending time on. Also, in + * unprivileged containers we might lack the rights to + * unmount these things, hence don't bother. */ + if (mount_point_is_api(p) || + mount_point_ignore(p) || + PATH_STARTSWITH_SET(p, "/dev", "/sys", "/proc")) + continue; + + /* If we are in a container, don't attempt to + * read-only mount anything as that brings no real + * benefits, but might confuse the host, as we remount + * the superblock here, not the bind mount. + * + * If the filesystem is a network fs, also skip the + * remount. It brings no value (we cannot leave + * a "dirty fs") and could hang if the network is down. + * Note that umount2() is more careful and will not + * hang because of the network being down. */ + try_remount_ro = detect_container() <= 0 && + !fstype_is_network(fstype) && + !fstype_is_api_vfs(fstype) && + !fstype_is_ro(fstype) && + !fstab_test_yes_no_option(options, "ro\0rw\0"); + + if (try_remount_ro) { + /* mount(2) states that mount flags and options need to be exactly the same + * as they were when the filesystem was mounted, except for the desired + * changes. So we reconstruct both here and adjust them for the later + * remount call too. */ + + r = mnt_fs_get_propagation(fs, &remount_flags); + if (r < 0) { + log_warning_errno(r, "mnt_fs_get_propagation() failed for %s, ignoring: %m", path); + continue; + } + + r = mount_option_mangle(options, remount_flags, &remount_flags, &remount_options); + if (r < 0) { + log_warning_errno(r, "mount_option_mangle failed for %s, ignoring: %m", path); + continue; + } + + /* MS_BIND is special. If it is provided it will only make the mount-point + * read-only. If left out, the super block itself is remounted, which we want. */ + remount_flags = (remount_flags|MS_REMOUNT|MS_RDONLY) & ~MS_BIND; + } + + m = new0(MountPoint, 1); + if (!m) + return log_oom(); + + free_and_replace(m->path, p); + free_and_replace(m->remount_options, remount_options); + m->remount_flags = remount_flags; + m->try_remount_ro = try_remount_ro; + + LIST_PREPEND(mount_point, *head, m); + } + + return 0; +} + +int swap_list_get(const char *swaps, MountPoint **head) { + _cleanup_(mnt_free_tablep) struct libmnt_table *t = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *i = NULL; + int r; + + assert(head); + + t = mnt_new_table(); + i = mnt_new_iter(MNT_ITER_FORWARD); + if (!t || !i) + return log_oom(); + + r = mnt_table_parse_swaps(t, swaps); + if (r < 0) + return log_error_errno(r, "Failed to parse %s: %m", swaps); + + for (;;) { + struct libmnt_fs *fs; + + MountPoint *swap; + const char *source; + _cleanup_free_ char *d = NULL; + + r = mnt_table_next_fs(t, i, &fs); + if (r == 1) + break; + if (r < 0) + return log_error_errno(r, "Failed to get next entry from %s: %m", swaps); + + source = mnt_fs_get_source(fs); + if (!source) + continue; + + r = cunescape(source, UNESCAPE_RELAX, &d); + if (r < 0) + return r; + + swap = new0(MountPoint, 1); + if (!swap) + return -ENOMEM; + + free_and_replace(swap->path, d); + LIST_PREPEND(mount_point, *head, swap); + } + + return 0; +} + +static int loopback_list_get(MountPoint **head) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + sd_device *d; + int r; + + assert(head); + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "block", true); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_sysname(e, "loop*"); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_sysattr(e, "loop/backing_file", NULL, true); + if (r < 0) + return r; + + FOREACH_DEVICE(e, d) { + _cleanup_free_ char *p = NULL; + const char *dn; + MountPoint *lb; + + if (sd_device_get_devname(d, &dn) < 0) + continue; + + p = strdup(dn); + if (!p) + return -ENOMEM; + + lb = new(MountPoint, 1); + if (!lb) + return -ENOMEM; + + *lb = (MountPoint) { + .path = TAKE_PTR(p), + }; + + LIST_PREPEND(mount_point, *head, lb); + } + + return 0; +} + +static int dm_list_get(MountPoint **head) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + sd_device *d; + int r; + + assert(head); + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "block", true); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_sysname(e, "dm-*"); + if (r < 0) + return r; + + FOREACH_DEVICE(e, d) { + _cleanup_free_ char *p = NULL; + const char *dn; + MountPoint *m; + dev_t devnum; + + if (sd_device_get_devnum(d, &devnum) < 0 || + sd_device_get_devname(d, &dn) < 0) + continue; + + p = strdup(dn); + if (!p) + return -ENOMEM; + + m = new(MountPoint, 1); + if (!m) + return -ENOMEM; + + *m = (MountPoint) { + .path = TAKE_PTR(p), + .devnum = devnum, + }; + + LIST_PREPEND(mount_point, *head, m); + } + + return 0; +} + +static int delete_loopback(const char *device) { + _cleanup_close_ int fd = -1; + int r; + + assert(device); + + fd = open(device, O_RDONLY|O_CLOEXEC); + if (fd < 0) + return errno == ENOENT ? 0 : -errno; + + r = ioctl(fd, LOOP_CLR_FD, 0); + if (r >= 0) + return 1; + + /* ENXIO: not bound, so no error */ + if (errno == ENXIO) + return 0; + + return -errno; +} + +static int delete_dm(dev_t devnum) { + + struct dm_ioctl dm = { + .version = { + DM_VERSION_MAJOR, + DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL + }, + .data_size = sizeof(dm), + .dev = devnum, + }; + + _cleanup_close_ int fd = -1; + + assert(major(devnum) != 0); + + fd = open("/dev/mapper/control", O_RDWR|O_CLOEXEC); + if (fd < 0) + return -errno; + + if (ioctl(fd, DM_DEV_REMOVE, &dm) < 0) + return -errno; + + return 0; +} + +static bool nonunmountable_path(const char *path) { + return path_equal(path, "/") +#if ! HAVE_SPLIT_USR + || path_equal(path, "/usr") +#endif + || path_startswith(path, "/run/initramfs"); +} + +static int remount_with_timeout(MountPoint *m, int umount_log_level) { + pid_t pid; + int r; + + BLOCK_SIGNALS(SIGCHLD); + + assert(m); + + /* Due to the possiblity of a remount operation hanging, we + * fork a child process and set a timeout. If the timeout + * lapses, the assumption is that that particular remount + * failed. */ + r = safe_fork("(sd-remount)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_REOPEN_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + log_info("Remounting '%s' read-only in with options '%s'.", m->path, m->remount_options); + + /* Start the mount operation here in the child */ + r = mount(NULL, m->path, NULL, m->remount_flags, m->remount_options); + if (r < 0) + log_full_errno(umount_log_level, errno, "Failed to remount '%s' read-only: %m", m->path); + + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + + r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC); + if (r == -ETIMEDOUT) { + log_error_errno(r, "Remounting '%s' timed out, issuing SIGKILL to PID " PID_FMT ".", m->path, pid); + (void) kill(pid, SIGKILL); + } else if (r == -EPROTO) + log_debug_errno(r, "Remounting '%s' failed abnormally, child process " PID_FMT " aborted or exited non-zero.", m->path, pid); + else if (r < 0) + log_error_errno(r, "Remounting '%s' failed unexpectedly, couldn't wait for child process " PID_FMT ": %m", m->path, pid); + + return r; +} + +static int umount_with_timeout(MountPoint *m, int umount_log_level) { + pid_t pid; + int r; + + BLOCK_SIGNALS(SIGCHLD); + + assert(m); + + /* Due to the possiblity of a umount operation hanging, we + * fork a child process and set a timeout. If the timeout + * lapses, the assumption is that that particular umount + * failed. */ + r = safe_fork("(sd-umount)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_REOPEN_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + log_info("Unmounting '%s'.", m->path); + + /* Start the mount operation here in the child Using MNT_FORCE + * causes some filesystems (e.g. FUSE and NFS and other network + * filesystems) to abort any pending requests and return -EIO + * rather than blocking indefinitely. If the filesysten is + * "busy", this may allow processes to die, thus making the + * filesystem less busy so the unmount might succeed (rather + * then return EBUSY).*/ + r = umount2(m->path, MNT_FORCE); + if (r < 0) + log_full_errno(umount_log_level, errno, "Failed to unmount %s: %m", m->path); + + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + + r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC); + if (r == -ETIMEDOUT) { + log_error_errno(r, "Unmounting '%s' timed out, issuing SIGKILL to PID " PID_FMT ".", m->path, pid); + (void) kill(pid, SIGKILL); + } else if (r == -EPROTO) + log_debug_errno(r, "Unmounting '%s' failed abnormally, child process " PID_FMT " aborted or exited non-zero.", m->path, pid); + else if (r < 0) + log_error_errno(r, "Unmounting '%s' failed unexpectedly, couldn't wait for child process " PID_FMT ": %m", m->path, pid); + + return r; +} + +/* This includes remounting readonly, which changes the kernel mount options. + * Therefore the list passed to this function is invalidated, and should not be reused. */ +static int mount_points_list_umount(MountPoint **head, bool *changed, int umount_log_level) { + MountPoint *m; + int n_failed = 0; + + assert(head); + assert(changed); + + LIST_FOREACH(mount_point, m, *head) { + if (m->try_remount_ro) { + /* We always try to remount directories + * read-only first, before we go on and umount + * them. + * + * Mount points can be stacked. If a mount + * point is stacked below / or /usr, we + * cannot umount or remount it directly, + * since there is no way to refer to the + * underlying mount. There's nothing we can do + * about it for the general case, but we can + * do something about it if it is aliased + * somehwere else via a bind mount. If we + * explicitly remount the super block of that + * alias read-only we hence should be + * relatively safe regarding keeping a dirty fs + * we cannot otherwise see. + * + * Since the remount can hang in the instance of + * remote filesystems, we remount asynchronously + * and skip the subsequent umount if it fails. */ + if (remount_with_timeout(m, umount_log_level) < 0) { + /* Remount failed, but try unmounting anyway, + * unless this is a mount point we want to skip. */ + if (nonunmountable_path(m->path)) { + n_failed++; + continue; + } + } + } + + /* Skip / and /usr since we cannot unmount that + * anyway, since we are running from it. They have + * already been remounted ro. */ + if (nonunmountable_path(m->path)) + continue; + + /* Trying to umount */ + if (umount_with_timeout(m, umount_log_level) < 0) + n_failed++; + else + *changed = true; + } + + return n_failed; +} + +static int swap_points_list_off(MountPoint **head, bool *changed) { + MountPoint *m, *n; + int n_failed = 0; + + assert(head); + assert(changed); + + LIST_FOREACH_SAFE(mount_point, m, n, *head) { + log_info("Deactivating swap %s.", m->path); + if (swapoff(m->path) == 0) { + *changed = true; + mount_point_free(head, m); + } else { + log_warning_errno(errno, "Could not deactivate swap %s: %m", m->path); + n_failed++; + } + } + + return n_failed; +} + +static int loopback_points_list_detach(MountPoint **head, bool *changed, int umount_log_level) { + MountPoint *m, *n; + int n_failed = 0, k; + struct stat root_st; + + assert(head); + assert(changed); + + k = lstat("/", &root_st); + + LIST_FOREACH_SAFE(mount_point, m, n, *head) { + int r; + struct stat loopback_st; + + if (k >= 0 && + major(root_st.st_dev) != 0 && + lstat(m->path, &loopback_st) >= 0 && + root_st.st_dev == loopback_st.st_rdev) { + n_failed++; + continue; + } + + log_info("Detaching loopback %s.", m->path); + r = delete_loopback(m->path); + if (r >= 0) { + if (r > 0) + *changed = true; + + mount_point_free(head, m); + } else { + log_full_errno(umount_log_level, errno, "Could not detach loopback %s: %m", m->path); + n_failed++; + } + } + + return n_failed; +} + +static int dm_points_list_detach(MountPoint **head, bool *changed, int umount_log_level) { + MountPoint *m, *n; + int n_failed = 0, r; + dev_t rootdev; + + assert(head); + assert(changed); + + r = get_block_device("/", &rootdev); + if (r <= 0) + rootdev = 0; + + LIST_FOREACH_SAFE(mount_point, m, n, *head) { + + if (major(rootdev) != 0 && rootdev == m->devnum) { + n_failed ++; + continue; + } + + log_info("Detaching DM %u:%u.", major(m->devnum), minor(m->devnum)); + r = delete_dm(m->devnum); + if (r >= 0) { + *changed = true; + mount_point_free(head, m); + } else { + log_full_errno(umount_log_level, errno, "Could not detach DM %s: %m", m->path); + n_failed++; + } + } + + return n_failed; +} + +static int umount_all_once(bool *changed, int umount_log_level) { + int r; + _cleanup_(mount_points_list_free) LIST_HEAD(MountPoint, mp_list_head); + + assert(changed); + + LIST_HEAD_INIT(mp_list_head); + r = mount_points_list_get(NULL, &mp_list_head); + if (r < 0) + return r; + + return mount_points_list_umount(&mp_list_head, changed, umount_log_level); +} + +int umount_all(bool *changed, int umount_log_level) { + bool umount_changed; + int r; + + assert(changed); + + /* Retry umount, until nothing can be umounted anymore. Mounts are + * processed in order, newest first. The retries are needed when + * an old mount has been moved, to a path inside a newer mount. */ + do { + umount_changed = false; + + r = umount_all_once(&umount_changed, umount_log_level); + if (umount_changed) + *changed = true; + } while (umount_changed); + + return r; +} + +int swapoff_all(bool *changed) { + _cleanup_(mount_points_list_free) LIST_HEAD(MountPoint, swap_list_head); + int r; + + assert(changed); + + LIST_HEAD_INIT(swap_list_head); + + r = swap_list_get(NULL, &swap_list_head); + if (r < 0) + return r; + + return swap_points_list_off(&swap_list_head, changed); +} + +int loopback_detach_all(bool *changed, int umount_log_level) { + _cleanup_(mount_points_list_free) LIST_HEAD(MountPoint, loopback_list_head); + int r; + + assert(changed); + + LIST_HEAD_INIT(loopback_list_head); + + r = loopback_list_get(&loopback_list_head); + if (r < 0) + return r; + + return loopback_points_list_detach(&loopback_list_head, changed, umount_log_level); +} + +int dm_detach_all(bool *changed, int umount_log_level) { + _cleanup_(mount_points_list_free) LIST_HEAD(MountPoint, dm_list_head); + int r; + + assert(changed); + + LIST_HEAD_INIT(dm_list_head); + + r = dm_list_get(&dm_list_head); + if (r < 0) + return r; + + return dm_points_list_detach(&dm_list_head, changed, umount_log_level); +} diff --git a/src/core/umount.h b/src/core/umount.h new file mode 100644 index 0000000..6f2b24d --- /dev/null +++ b/src/core/umount.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include "list.h" + +int umount_all(bool *changed, int umount_log_level); + +int swapoff_all(bool *changed); + +int loopback_detach_all(bool *changed, int umount_log_level); + +int dm_detach_all(bool *changed, int umount_log_level); + +/* This is exported just for testing */ +typedef struct MountPoint { + char *path; + char *remount_options; + unsigned long remount_flags; + bool try_remount_ro; + dev_t devnum; + LIST_FIELDS(struct MountPoint, mount_point); +} MountPoint; + +int mount_points_list_get(const char *mountinfo, MountPoint **head); +void mount_points_list_free(MountPoint **head); +int swap_list_get(const char *swaps, MountPoint **head); diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c new file mode 100644 index 0000000..72391ac --- /dev/null +++ b/src/core/unit-printf.c @@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "alloc-util.h" +#include "cgroup-util.h" +#include "format-util.h" +#include "macro.h" +#include "specifier.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit-printf.h" +#include "unit.h" +#include "user-util.h" + +static int specifier_prefix_and_instance(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + + assert(u); + + return unit_name_to_prefix_and_instance(u->id, ret); +} + +static int specifier_prefix(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + + assert(u); + + return unit_name_to_prefix(u->id, ret); +} + +static int specifier_prefix_unescaped(char specifier, const void *data, const void *userdata, char **ret) { + _cleanup_free_ char *p = NULL; + const Unit *u = userdata; + int r; + + assert(u); + + r = unit_name_to_prefix(u->id, &p); + if (r < 0) + return r; + + return unit_name_unescape(p, ret); +} + +static int specifier_instance_unescaped(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + + assert(u); + + return unit_name_unescape(strempty(u->instance), ret); +} + +static int specifier_last_component(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + _cleanup_free_ char *prefix = NULL; + char *dash; + int r; + + assert(u); + + r = unit_name_to_prefix(u->id, &prefix); + if (r < 0) + return r; + + dash = strrchr(prefix, '-'); + if (dash) + return specifier_string(specifier, dash + 1, userdata, ret); + + *ret = TAKE_PTR(prefix); + return 0; +} + +static int specifier_last_component_unescaped(char specifier, const void *data, const void *userdata, char **ret) { + _cleanup_free_ char *p = NULL; + int r; + + r = specifier_last_component(specifier, data, userdata, &p); + if (r < 0) + return r; + + return unit_name_unescape(p, ret); +} + +static int specifier_filename(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + + assert(u); + + if (u->instance) + return unit_name_path_unescape(u->instance, ret); + else + return unit_name_to_path(u->id, ret); +} + +static void bad_specifier(const Unit *u, char specifier) { + log_unit_warning(u, "Specifier '%%%c' used in unit configuration, which is deprecated. Please update your unit file, as it does not work as intended.", specifier); +} + +static int specifier_cgroup(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + char *n; + + assert(u); + + bad_specifier(u, specifier); + + if (u->cgroup_path) + n = strdup(u->cgroup_path); + else + n = unit_default_cgroup_path(u); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +static int specifier_cgroup_root(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + char *n; + + assert(u); + + bad_specifier(u, specifier); + + n = strdup(u->manager->cgroup_root); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +static int specifier_cgroup_slice(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + char *n; + + assert(u); + + bad_specifier(u, specifier); + + if (UNIT_ISSET(u->slice)) { + const Unit *slice; + + slice = UNIT_DEREF(u->slice); + + if (slice->cgroup_path) + n = strdup(slice->cgroup_path); + else + n = unit_default_cgroup_path(slice); + } else + n = strdup(u->manager->cgroup_root); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +static int specifier_special_directory(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + char *n = NULL; + + assert(u); + + n = strdup(u->manager->prefix[PTR_TO_UINT(data)]); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +int unit_name_printf(Unit *u, const char* format, char **ret) { + + /* + * This will use the passed string as format string and replace the following specifiers (which should all be + * safe for inclusion in unit names): + * + * %n: the full id of the unit (foo@bar.waldo) + * %N: the id of the unit without the suffix (foo@bar) + * %p: the prefix (foo) + * %i: the instance (bar) + * + * %U: the UID of the running user + * %u: the username of the running user + * + * %m: the machine ID of the running system + * %H: the host name of the running system + * %b: the boot ID of the running system + */ + + const Specifier table[] = { + { 'n', specifier_string, u->id }, + { 'N', specifier_prefix_and_instance, NULL }, + { 'p', specifier_prefix, NULL }, + { 'i', specifier_string, u->instance }, + { 'j', specifier_last_component, NULL }, + + { 'g', specifier_group_name, NULL }, + { 'G', specifier_group_id, NULL }, + { 'U', specifier_user_id, NULL }, + { 'u', specifier_user_name, NULL }, + + { 'm', specifier_machine_id, NULL }, + { 'H', specifier_host_name, NULL }, + { 'b', specifier_boot_id, NULL }, + {} + }; + + assert(u); + assert(format); + assert(ret); + + return specifier_printf(format, table, u, ret); +} + +int unit_full_printf(Unit *u, const char *format, char **ret) { + + /* This is similar to unit_name_printf() but also supports unescaping. Also, adds a couple of additional codes + * (which are likely not suitable for unescaped inclusion in unit names): + * + * %f: the unescaped instance if set, otherwise the id unescaped as path + * + * %c: cgroup path of unit (deprecated) + * %r: where units in this slice are placed in the cgroup tree (deprecated) + * %R: the root of this systemd's instance tree (deprecated) + * + * %t: the runtime directory root (e.g. /run or $XDG_RUNTIME_DIR) + * %S: the state directory root (e.g. /var/lib or $XDG_CONFIG_HOME) + * %C: the cache directory root (e.g. /var/cache or $XDG_CACHE_HOME) + * %L: the log directory root (e.g. /var/log or $XDG_CONFIG_HOME/log) + * %E: the configuration directory root (e.g. /etc or $XDG_CONFIG_HOME) + * %T: the temporary directory (e.g. /tmp, or $TMPDIR, $TEMP, $TMP) + * %V: the temporary directory for large, persistent stuff (e.g. /var/tmp, or $TMPDIR, $TEMP, $TMP) + * + * %h: the homedir of the running user + * %s: the shell of the running user + * + * %v: `uname -r` of the running system + * + * NOTICE: When you add new entries here, please be careful: specifiers which depend on settings of the unit + * file itself are broken by design, as they would resolve differently depending on whether they are used + * before or after the relevant configuration setting. Hence: don't add them. + */ + + const Specifier table[] = { + { 'n', specifier_string, u->id }, + { 'N', specifier_prefix_and_instance, NULL }, + { 'p', specifier_prefix, NULL }, + { 'P', specifier_prefix_unescaped, NULL }, + { 'i', specifier_string, u->instance }, + { 'I', specifier_instance_unescaped, NULL }, + { 'j', specifier_last_component, NULL }, + { 'J', specifier_last_component_unescaped, NULL }, + + { 'f', specifier_filename, NULL }, + { 'c', specifier_cgroup, NULL }, + { 'r', specifier_cgroup_slice, NULL }, + { 'R', specifier_cgroup_root, NULL }, + + { 't', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_RUNTIME) }, + { 'S', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_STATE) }, + { 'C', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CACHE) }, + { 'L', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_LOGS) }, + { 'E', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CONFIGURATION) }, + { 'T', specifier_tmp_dir, NULL }, + { 'V', specifier_var_tmp_dir, NULL }, + + { 'g', specifier_group_name, NULL }, + { 'G', specifier_group_id, NULL }, + { 'U', specifier_user_id, NULL }, + { 'u', specifier_user_name, NULL }, + { 'h', specifier_user_home, NULL }, + { 's', specifier_user_shell, NULL }, + + { 'm', specifier_machine_id, NULL }, + { 'H', specifier_host_name, NULL }, + { 'b', specifier_boot_id, NULL }, + { 'v', specifier_kernel_release, NULL }, + {} + }; + + assert(u); + assert(format); + assert(ret); + + return specifier_printf(format, table, u, ret); +} diff --git a/src/core/unit-printf.h b/src/core/unit-printf.h new file mode 100644 index 0000000..f3dae15 --- /dev/null +++ b/src/core/unit-printf.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "unit.h" + +int unit_name_printf(Unit *u, const char* text, char **ret); +int unit_full_printf(Unit *u, const char *text, char **ret); diff --git a/src/core/unit.c b/src/core/unit.c new file mode 100644 index 0000000..24b14fb --- /dev/null +++ b/src/core/unit.c @@ -0,0 +1,5593 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "sd-id128.h" +#include "sd-messages.h" + +#include "all-units.h" +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-util.h" +#include "cgroup-util.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "dropin.h" +#include "escape.h" +#include "execute.h" +#include "fd-util.h" +#include "fileio-label.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "id128-util.h" +#include "io-util.h" +#include "load-dropin.h" +#include "load-fragment.h" +#include "log.h" +#include "macro.h" +#include "missing.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "set.h" +#include "signal-util.h" +#include "sparse-endian.h" +#include "special.h" +#include "specifier.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "unit-name.h" +#include "unit.h" +#include "user-util.h" +#include "virt.h" + +const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX] = { + [UNIT_SERVICE] = &service_vtable, + [UNIT_SOCKET] = &socket_vtable, + [UNIT_TARGET] = &target_vtable, + [UNIT_DEVICE] = &device_vtable, + [UNIT_MOUNT] = &mount_vtable, + [UNIT_AUTOMOUNT] = &automount_vtable, + [UNIT_SWAP] = &swap_vtable, + [UNIT_TIMER] = &timer_vtable, + [UNIT_PATH] = &path_vtable, + [UNIT_SLICE] = &slice_vtable, + [UNIT_SCOPE] = &scope_vtable, +}; + +static void maybe_warn_about_dependency(Unit *u, const char *other, UnitDependency dependency); + +Unit *unit_new(Manager *m, size_t size) { + Unit *u; + + assert(m); + assert(size >= sizeof(Unit)); + + u = malloc0(size); + if (!u) + return NULL; + + u->names = set_new(&string_hash_ops); + if (!u->names) + return mfree(u); + + u->manager = m; + u->type = _UNIT_TYPE_INVALID; + u->default_dependencies = true; + u->unit_file_state = _UNIT_FILE_STATE_INVALID; + u->unit_file_preset = -1; + u->on_failure_job_mode = JOB_REPLACE; + u->cgroup_inotify_wd = -1; + u->job_timeout = USEC_INFINITY; + u->job_running_timeout = USEC_INFINITY; + u->ref_uid = UID_INVALID; + u->ref_gid = GID_INVALID; + u->cpu_usage_last = NSEC_INFINITY; + u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL; + u->failure_action_exit_status = u->success_action_exit_status = -1; + + u->ip_accounting_ingress_map_fd = -1; + u->ip_accounting_egress_map_fd = -1; + u->ipv4_allow_map_fd = -1; + u->ipv6_allow_map_fd = -1; + u->ipv4_deny_map_fd = -1; + u->ipv6_deny_map_fd = -1; + + u->last_section_private = -1; + + RATELIMIT_INIT(u->start_limit, m->default_start_limit_interval, m->default_start_limit_burst); + RATELIMIT_INIT(u->auto_stop_ratelimit, 10 * USEC_PER_SEC, 16); + + return u; +} + +int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret) { + _cleanup_(unit_freep) Unit *u = NULL; + int r; + + u = unit_new(m, size); + if (!u) + return -ENOMEM; + + r = unit_add_name(u, name); + if (r < 0) + return r; + + *ret = TAKE_PTR(u); + + return r; +} + +bool unit_has_name(const Unit *u, const char *name) { + assert(u); + assert(name); + + return set_contains(u->names, (char*) name); +} + +static void unit_init(Unit *u) { + CGroupContext *cc; + ExecContext *ec; + KillContext *kc; + + assert(u); + assert(u->manager); + assert(u->type >= 0); + + cc = unit_get_cgroup_context(u); + if (cc) { + cgroup_context_init(cc); + + /* Copy in the manager defaults into the cgroup + * context, _before_ the rest of the settings have + * been initialized */ + + cc->cpu_accounting = u->manager->default_cpu_accounting; + cc->io_accounting = u->manager->default_io_accounting; + cc->ip_accounting = u->manager->default_ip_accounting; + cc->blockio_accounting = u->manager->default_blockio_accounting; + cc->memory_accounting = u->manager->default_memory_accounting; + cc->tasks_accounting = u->manager->default_tasks_accounting; + cc->ip_accounting = u->manager->default_ip_accounting; + + if (u->type != UNIT_SLICE) + cc->tasks_max = u->manager->default_tasks_max; + } + + ec = unit_get_exec_context(u); + if (ec) { + exec_context_init(ec); + + ec->keyring_mode = MANAGER_IS_SYSTEM(u->manager) ? + EXEC_KEYRING_SHARED : EXEC_KEYRING_INHERIT; + } + + kc = unit_get_kill_context(u); + if (kc) + kill_context_init(kc); + + if (UNIT_VTABLE(u)->init) + UNIT_VTABLE(u)->init(u); +} + +int unit_add_name(Unit *u, const char *text) { + _cleanup_free_ char *s = NULL, *i = NULL; + UnitType t; + int r; + + assert(u); + assert(text); + + if (unit_name_is_valid(text, UNIT_NAME_TEMPLATE)) { + + if (!u->instance) + return -EINVAL; + + r = unit_name_replace_instance(text, u->instance, &s); + if (r < 0) + return r; + } else { + s = strdup(text); + if (!s) + return -ENOMEM; + } + + if (set_contains(u->names, s)) + return 0; + if (hashmap_contains(u->manager->units, s)) + return -EEXIST; + + if (!unit_name_is_valid(s, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + return -EINVAL; + + t = unit_name_to_type(s); + if (t < 0) + return -EINVAL; + + if (u->type != _UNIT_TYPE_INVALID && t != u->type) + return -EINVAL; + + r = unit_name_to_instance(s, &i); + if (r < 0) + return r; + + if (i && !unit_type_may_template(t)) + return -EINVAL; + + /* Ensure that this unit is either instanced or not instanced, + * but not both. Note that we do allow names with different + * instance names however! */ + if (u->type != _UNIT_TYPE_INVALID && !u->instance != !i) + return -EINVAL; + + if (!unit_type_may_alias(t) && !set_isempty(u->names)) + return -EEXIST; + + if (hashmap_size(u->manager->units) >= MANAGER_MAX_NAMES) + return -E2BIG; + + r = set_put(u->names, s); + if (r < 0) + return r; + assert(r > 0); + + r = hashmap_put(u->manager->units, s, u); + if (r < 0) { + (void) set_remove(u->names, s); + return r; + } + + if (u->type == _UNIT_TYPE_INVALID) { + u->type = t; + u->id = s; + u->instance = TAKE_PTR(i); + + LIST_PREPEND(units_by_type, u->manager->units_by_type[t], u); + + unit_init(u); + } + + s = NULL; + + unit_add_to_dbus_queue(u); + return 0; +} + +int unit_choose_id(Unit *u, const char *name) { + _cleanup_free_ char *t = NULL; + char *s, *i; + int r; + + assert(u); + assert(name); + + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + + if (!u->instance) + return -EINVAL; + + r = unit_name_replace_instance(name, u->instance, &t); + if (r < 0) + return r; + + name = t; + } + + /* Selects one of the names of this unit as the id */ + s = set_get(u->names, (char*) name); + if (!s) + return -ENOENT; + + /* Determine the new instance from the new id */ + r = unit_name_to_instance(s, &i); + if (r < 0) + return r; + + u->id = s; + + free(u->instance); + u->instance = i; + + unit_add_to_dbus_queue(u); + + return 0; +} + +int unit_set_description(Unit *u, const char *description) { + int r; + + assert(u); + + r = free_and_strdup(&u->description, empty_to_null(description)); + if (r < 0) + return r; + if (r > 0) + unit_add_to_dbus_queue(u); + + return 0; +} + +bool unit_may_gc(Unit *u) { + UnitActiveState state; + int r; + + assert(u); + + /* Checks whether the unit is ready to be unloaded for garbage collection. + * Returns true when the unit may be collected, and false if there's some + * reason to keep it loaded. + * + * References from other units are *not* checked here. Instead, this is done + * in unit_gc_sweep(), but using markers to properly collect dependency loops. + */ + + if (u->job) + return false; + + if (u->nop_job) + return false; + + state = unit_active_state(u); + + /* If the unit is inactive and failed and no job is queued for it, then release its runtime resources */ + if (UNIT_IS_INACTIVE_OR_FAILED(state) && + UNIT_VTABLE(u)->release_resources) + UNIT_VTABLE(u)->release_resources(u); + + if (u->perpetual) + return false; + + if (sd_bus_track_count(u->bus_track) > 0) + return false; + + /* But we keep the unit object around for longer when it is referenced or configured to not be gc'ed */ + switch (u->collect_mode) { + + case COLLECT_INACTIVE: + if (state != UNIT_INACTIVE) + return false; + + break; + + case COLLECT_INACTIVE_OR_FAILED: + if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED)) + return false; + + break; + + default: + assert_not_reached("Unknown garbage collection mode"); + } + + if (u->cgroup_path) { + /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay + * around. Units with active processes should never be collected. */ + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path); + if (r <= 0) + return false; + } + + if (UNIT_VTABLE(u)->may_gc && !UNIT_VTABLE(u)->may_gc(u)) + return false; + + return true; +} + +void unit_add_to_load_queue(Unit *u) { + assert(u); + assert(u->type != _UNIT_TYPE_INVALID); + + if (u->load_state != UNIT_STUB || u->in_load_queue) + return; + + LIST_PREPEND(load_queue, u->manager->load_queue, u); + u->in_load_queue = true; +} + +void unit_add_to_cleanup_queue(Unit *u) { + assert(u); + + if (u->in_cleanup_queue) + return; + + LIST_PREPEND(cleanup_queue, u->manager->cleanup_queue, u); + u->in_cleanup_queue = true; +} + +void unit_add_to_gc_queue(Unit *u) { + assert(u); + + if (u->in_gc_queue || u->in_cleanup_queue) + return; + + if (!unit_may_gc(u)) + return; + + LIST_PREPEND(gc_queue, u->manager->gc_unit_queue, u); + u->in_gc_queue = true; +} + +void unit_add_to_dbus_queue(Unit *u) { + assert(u); + assert(u->type != _UNIT_TYPE_INVALID); + + if (u->load_state == UNIT_STUB || u->in_dbus_queue) + return; + + /* Shortcut things if nobody cares */ + if (sd_bus_track_count(u->manager->subscribed) <= 0 && + sd_bus_track_count(u->bus_track) <= 0 && + set_isempty(u->manager->private_buses)) { + u->sent_dbus_new_signal = true; + return; + } + + LIST_PREPEND(dbus_queue, u->manager->dbus_unit_queue, u); + u->in_dbus_queue = true; +} + +void unit_submit_to_stop_when_unneeded_queue(Unit *u) { + assert(u); + + if (u->in_stop_when_unneeded_queue) + return; + + if (!u->stop_when_unneeded) + return; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return; + + LIST_PREPEND(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u); + u->in_stop_when_unneeded_queue = true; +} + +static void bidi_set_free(Unit *u, Hashmap *h) { + Unit *other; + Iterator i; + void *v; + + assert(u); + + /* Frees the hashmap and makes sure we are dropped from the inverse pointers */ + + HASHMAP_FOREACH_KEY(v, other, h, i) { + UnitDependency d; + + for (d = 0; d < _UNIT_DEPENDENCY_MAX; d++) + hashmap_remove(other->dependencies[d], u); + + unit_add_to_gc_queue(other); + } + + hashmap_free(h); +} + +static void unit_remove_transient(Unit *u) { + char **i; + + assert(u); + + if (!u->transient) + return; + + if (u->fragment_path) + (void) unlink(u->fragment_path); + + STRV_FOREACH(i, u->dropin_paths) { + _cleanup_free_ char *p = NULL, *pp = NULL; + + p = dirname_malloc(*i); /* Get the drop-in directory from the drop-in file */ + if (!p) + continue; + + pp = dirname_malloc(p); /* Get the config directory from the drop-in directory */ + if (!pp) + continue; + + /* Only drop transient drop-ins */ + if (!path_equal(u->manager->lookup_paths.transient, pp)) + continue; + + (void) unlink(*i); + (void) rmdir(p); + } +} + +static void unit_free_requires_mounts_for(Unit *u) { + assert(u); + + for (;;) { + _cleanup_free_ char *path; + + path = hashmap_steal_first_key(u->requires_mounts_for); + if (!path) + break; + else { + char s[strlen(path) + 1]; + + PATH_FOREACH_PREFIX_MORE(s, path) { + char *y; + Set *x; + + x = hashmap_get2(u->manager->units_requiring_mounts_for, s, (void**) &y); + if (!x) + continue; + + (void) set_remove(x, u); + + if (set_isempty(x)) { + (void) hashmap_remove(u->manager->units_requiring_mounts_for, y); + free(y); + set_free(x); + } + } + } + } + + u->requires_mounts_for = hashmap_free(u->requires_mounts_for); +} + +static void unit_done(Unit *u) { + ExecContext *ec; + CGroupContext *cc; + + assert(u); + + if (u->type < 0) + return; + + if (UNIT_VTABLE(u)->done) + UNIT_VTABLE(u)->done(u); + + ec = unit_get_exec_context(u); + if (ec) + exec_context_done(ec); + + cc = unit_get_cgroup_context(u); + if (cc) + cgroup_context_done(cc); +} + +void unit_free(Unit *u) { + UnitDependency d; + Iterator i; + char *t; + + if (!u) + return; + + if (UNIT_ISSET(u->slice)) { + /* A unit is being dropped from the tree, make sure our parent slice recalculates the member mask */ + unit_invalidate_cgroup_members_masks(UNIT_DEREF(u->slice)); + + /* And make sure the parent is realized again, updating cgroup memberships */ + unit_add_to_cgroup_realize_queue(UNIT_DEREF(u->slice)); + } + + u->transient_file = safe_fclose(u->transient_file); + + if (!MANAGER_IS_RELOADING(u->manager)) + unit_remove_transient(u); + + bus_unit_send_removed_signal(u); + + unit_done(u); + + unit_dequeue_rewatch_pids(u); + + sd_bus_slot_unref(u->match_bus_slot); + sd_bus_track_unref(u->bus_track); + u->deserialized_refs = strv_free(u->deserialized_refs); + + unit_free_requires_mounts_for(u); + + SET_FOREACH(t, u->names, i) + hashmap_remove_value(u->manager->units, t, u); + + if (!sd_id128_is_null(u->invocation_id)) + hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u); + + if (u->job) { + Job *j = u->job; + job_uninstall(j); + job_free(j); + } + + if (u->nop_job) { + Job *j = u->nop_job; + job_uninstall(j); + job_free(j); + } + + for (d = 0; d < _UNIT_DEPENDENCY_MAX; d++) + bidi_set_free(u, u->dependencies[d]); + + if (u->on_console) + manager_unref_console(u->manager); + + unit_release_cgroup(u); + + if (!MANAGER_IS_RELOADING(u->manager)) + unit_unlink_state_files(u); + + unit_unref_uid_gid(u, false); + + (void) manager_update_failed_units(u->manager, u, false); + set_remove(u->manager->startup_units, u); + + unit_unwatch_all_pids(u); + + unit_ref_unset(&u->slice); + while (u->refs_by_target) + unit_ref_unset(u->refs_by_target); + + if (u->type != _UNIT_TYPE_INVALID) + LIST_REMOVE(units_by_type, u->manager->units_by_type[u->type], u); + + if (u->in_load_queue) + LIST_REMOVE(load_queue, u->manager->load_queue, u); + + if (u->in_dbus_queue) + LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u); + + if (u->in_gc_queue) + LIST_REMOVE(gc_queue, u->manager->gc_unit_queue, u); + + if (u->in_cgroup_realize_queue) + LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u); + + if (u->in_cgroup_empty_queue) + LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u); + + if (u->in_cleanup_queue) + LIST_REMOVE(cleanup_queue, u->manager->cleanup_queue, u); + + if (u->in_target_deps_queue) + LIST_REMOVE(target_deps_queue, u->manager->target_deps_queue, u); + + if (u->in_stop_when_unneeded_queue) + LIST_REMOVE(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u); + + safe_close(u->ip_accounting_ingress_map_fd); + safe_close(u->ip_accounting_egress_map_fd); + + safe_close(u->ipv4_allow_map_fd); + safe_close(u->ipv6_allow_map_fd); + safe_close(u->ipv4_deny_map_fd); + safe_close(u->ipv6_deny_map_fd); + + bpf_program_unref(u->ip_bpf_ingress); + bpf_program_unref(u->ip_bpf_ingress_installed); + bpf_program_unref(u->ip_bpf_egress); + bpf_program_unref(u->ip_bpf_egress_installed); + + bpf_program_unref(u->bpf_device_control_installed); + + condition_free_list(u->conditions); + condition_free_list(u->asserts); + + free(u->description); + strv_free(u->documentation); + free(u->fragment_path); + free(u->source_path); + strv_free(u->dropin_paths); + free(u->instance); + + free(u->job_timeout_reboot_arg); + + set_free_free(u->names); + + free(u->reboot_arg); + + free(u); +} + +UnitActiveState unit_active_state(Unit *u) { + assert(u); + + if (u->load_state == UNIT_MERGED) + return unit_active_state(unit_follow_merge(u)); + + /* After a reload it might happen that a unit is not correctly + * loaded but still has a process around. That's why we won't + * shortcut failed loading to UNIT_INACTIVE_FAILED. */ + + return UNIT_VTABLE(u)->active_state(u); +} + +const char* unit_sub_state_to_string(Unit *u) { + assert(u); + + return UNIT_VTABLE(u)->sub_state_to_string(u); +} + +static int set_complete_move(Set **s, Set **other) { + assert(s); + assert(other); + + if (!other) + return 0; + + if (*s) + return set_move(*s, *other); + else + *s = TAKE_PTR(*other); + + return 0; +} + +static int hashmap_complete_move(Hashmap **s, Hashmap **other) { + assert(s); + assert(other); + + if (!*other) + return 0; + + if (*s) + return hashmap_move(*s, *other); + else + *s = TAKE_PTR(*other); + + return 0; +} + +static int merge_names(Unit *u, Unit *other) { + char *t; + Iterator i; + int r; + + assert(u); + assert(other); + + r = set_complete_move(&u->names, &other->names); + if (r < 0) + return r; + + set_free_free(other->names); + other->names = NULL; + other->id = NULL; + + SET_FOREACH(t, u->names, i) + assert_se(hashmap_replace(u->manager->units, t, u) == 0); + + return 0; +} + +static int reserve_dependencies(Unit *u, Unit *other, UnitDependency d) { + unsigned n_reserve; + + assert(u); + assert(other); + assert(d < _UNIT_DEPENDENCY_MAX); + + /* + * If u does not have this dependency set allocated, there is no need + * to reserve anything. In that case other's set will be transferred + * as a whole to u by complete_move(). + */ + if (!u->dependencies[d]) + return 0; + + /* merge_dependencies() will skip a u-on-u dependency */ + n_reserve = hashmap_size(other->dependencies[d]) - !!hashmap_get(other->dependencies[d], u); + + return hashmap_reserve(u->dependencies[d], n_reserve); +} + +static void merge_dependencies(Unit *u, Unit *other, const char *other_id, UnitDependency d) { + Iterator i; + Unit *back; + void *v; + int r; + + /* Merges all dependencies of type 'd' of the unit 'other' into the deps of the unit 'u' */ + + assert(u); + assert(other); + assert(d < _UNIT_DEPENDENCY_MAX); + + /* Fix backwards pointers. Let's iterate through all dependendent units of the other unit. */ + HASHMAP_FOREACH_KEY(v, back, other->dependencies[d], i) { + UnitDependency k; + + /* Let's now iterate through the dependencies of that dependencies of the other units, looking for + * pointers back, and let's fix them up, to instead point to 'u'. */ + + for (k = 0; k < _UNIT_DEPENDENCY_MAX; k++) { + if (back == u) { + /* Do not add dependencies between u and itself. */ + if (hashmap_remove(back->dependencies[k], other)) + maybe_warn_about_dependency(u, other_id, k); + } else { + UnitDependencyInfo di_u, di_other, di_merged; + + /* Let's drop this dependency between "back" and "other", and let's create it between + * "back" and "u" instead. Let's merge the bit masks of the dependency we are moving, + * and any such dependency which might already exist */ + + di_other.data = hashmap_get(back->dependencies[k], other); + if (!di_other.data) + continue; /* dependency isn't set, let's try the next one */ + + di_u.data = hashmap_get(back->dependencies[k], u); + + di_merged = (UnitDependencyInfo) { + .origin_mask = di_u.origin_mask | di_other.origin_mask, + .destination_mask = di_u.destination_mask | di_other.destination_mask, + }; + + r = hashmap_remove_and_replace(back->dependencies[k], other, u, di_merged.data); + if (r < 0) + log_warning_errno(r, "Failed to remove/replace: back=%s other=%s u=%s: %m", back->id, other_id, u->id); + assert(r >= 0); + + /* assert_se(hashmap_remove_and_replace(back->dependencies[k], other, u, di_merged.data) >= 0); */ + } + } + + } + + /* Also do not move dependencies on u to itself */ + back = hashmap_remove(other->dependencies[d], u); + if (back) + maybe_warn_about_dependency(u, other_id, d); + + /* The move cannot fail. The caller must have performed a reservation. */ + assert_se(hashmap_complete_move(&u->dependencies[d], &other->dependencies[d]) == 0); + + other->dependencies[d] = hashmap_free(other->dependencies[d]); +} + +int unit_merge(Unit *u, Unit *other) { + UnitDependency d; + const char *other_id = NULL; + int r; + + assert(u); + assert(other); + assert(u->manager == other->manager); + assert(u->type != _UNIT_TYPE_INVALID); + + other = unit_follow_merge(other); + + if (other == u) + return 0; + + if (u->type != other->type) + return -EINVAL; + + if (!u->instance != !other->instance) + return -EINVAL; + + if (!unit_type_may_alias(u->type)) /* Merging only applies to unit names that support aliases */ + return -EEXIST; + + if (!IN_SET(other->load_state, UNIT_STUB, UNIT_NOT_FOUND)) + return -EEXIST; + + if (other->job) + return -EEXIST; + + if (other->nop_job) + return -EEXIST; + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) + return -EEXIST; + + if (other->id) + other_id = strdupa(other->id); + + /* Make reservations to ensure merge_dependencies() won't fail */ + for (d = 0; d < _UNIT_DEPENDENCY_MAX; d++) { + r = reserve_dependencies(u, other, d); + /* + * We don't rollback reservations if we fail. We don't have + * a way to undo reservations. A reservation is not a leak. + */ + if (r < 0) + return r; + } + + /* Merge names */ + r = merge_names(u, other); + if (r < 0) + return r; + + /* Redirect all references */ + while (other->refs_by_target) + unit_ref_set(other->refs_by_target, other->refs_by_target->source, u); + + /* Merge dependencies */ + for (d = 0; d < _UNIT_DEPENDENCY_MAX; d++) + merge_dependencies(u, other, other_id, d); + + other->load_state = UNIT_MERGED; + other->merged_into = u; + + /* If there is still some data attached to the other node, we + * don't need it anymore, and can free it. */ + if (other->load_state != UNIT_STUB) + if (UNIT_VTABLE(other)->done) + UNIT_VTABLE(other)->done(other); + + unit_add_to_dbus_queue(u); + unit_add_to_cleanup_queue(other); + + return 0; +} + +int unit_merge_by_name(Unit *u, const char *name) { + _cleanup_free_ char *s = NULL; + Unit *other; + int r; + + assert(u); + assert(name); + + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + if (!u->instance) + return -EINVAL; + + r = unit_name_replace_instance(name, u->instance, &s); + if (r < 0) + return r; + + name = s; + } + + other = manager_get_unit(u->manager, name); + if (other) + return unit_merge(u, other); + + return unit_add_name(u, name); +} + +Unit* unit_follow_merge(Unit *u) { + assert(u); + + while (u->load_state == UNIT_MERGED) + assert_se(u = u->merged_into); + + return u; +} + +int unit_add_exec_dependencies(Unit *u, ExecContext *c) { + ExecDirectoryType dt; + char **dp; + int r; + + assert(u); + assert(c); + + if (c->working_directory && !c->working_directory_missing_ok) { + r = unit_require_mounts_for(u, c->working_directory, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->root_directory) { + r = unit_require_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->root_image) { + r = unit_require_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + if (!u->manager->prefix[dt]) + continue; + + STRV_FOREACH(dp, c->directories[dt].paths) { + _cleanup_free_ char *p; + + p = strjoin(u->manager->prefix[dt], "/", *dp); + if (!p) + return -ENOMEM; + + r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + } + + if (!MANAGER_IS_SYSTEM(u->manager)) + return 0; + + if (c->private_tmp) { + const char *p; + + FOREACH_STRING(p, "/tmp", "/var/tmp") { + r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_TMPFILES_SETUP_SERVICE, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (!IN_SET(c->std_output, + EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_SYSLOG, EXEC_OUTPUT_SYSLOG_AND_CONSOLE) && + !IN_SET(c->std_error, + EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_SYSLOG, EXEC_OUTPUT_SYSLOG_AND_CONSOLE)) + return 0; + + /* If syslog or kernel logging is requested, make sure our own + * logging daemon is run first. */ + + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_JOURNALD_SOCKET, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + return 0; +} + +const char *unit_description(Unit *u) { + assert(u); + + if (u->description) + return u->description; + + return strna(u->id); +} + +static void print_unit_dependency_mask(FILE *f, const char *kind, UnitDependencyMask mask, bool *space) { + const struct { + UnitDependencyMask mask; + const char *name; + } table[] = { + { UNIT_DEPENDENCY_FILE, "file" }, + { UNIT_DEPENDENCY_IMPLICIT, "implicit" }, + { UNIT_DEPENDENCY_DEFAULT, "default" }, + { UNIT_DEPENDENCY_UDEV, "udev" }, + { UNIT_DEPENDENCY_PATH, "path" }, + { UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT, "mountinfo-implicit" }, + { UNIT_DEPENDENCY_MOUNTINFO_DEFAULT, "mountinfo-default" }, + { UNIT_DEPENDENCY_PROC_SWAP, "proc-swap" }, + }; + size_t i; + + assert(f); + assert(kind); + assert(space); + + for (i = 0; i < ELEMENTSOF(table); i++) { + + if (mask == 0) + break; + + if (FLAGS_SET(mask, table[i].mask)) { + if (*space) + fputc(' ', f); + else + *space = true; + + fputs(kind, f); + fputs("-", f); + fputs(table[i].name, f); + + mask &= ~table[i].mask; + } + } + + assert(mask == 0); +} + +void unit_dump(Unit *u, FILE *f, const char *prefix) { + char *t, **j; + UnitDependency d; + Iterator i; + const char *prefix2; + char + timestamp0[FORMAT_TIMESTAMP_MAX], + timestamp1[FORMAT_TIMESTAMP_MAX], + timestamp2[FORMAT_TIMESTAMP_MAX], + timestamp3[FORMAT_TIMESTAMP_MAX], + timestamp4[FORMAT_TIMESTAMP_MAX], + timespan[FORMAT_TIMESPAN_MAX]; + Unit *following; + _cleanup_set_free_ Set *following_set = NULL; + const char *n; + CGroupMask m; + int r; + + assert(u); + assert(u->type >= 0); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + fprintf(f, + "%s-> Unit %s:\n" + "%s\tDescription: %s\n" + "%s\tInstance: %s\n" + "%s\tUnit Load State: %s\n" + "%s\tUnit Active State: %s\n" + "%s\tState Change Timestamp: %s\n" + "%s\tInactive Exit Timestamp: %s\n" + "%s\tActive Enter Timestamp: %s\n" + "%s\tActive Exit Timestamp: %s\n" + "%s\tInactive Enter Timestamp: %s\n" + "%s\tMay GC: %s\n" + "%s\tNeed Daemon Reload: %s\n" + "%s\tTransient: %s\n" + "%s\tPerpetual: %s\n" + "%s\tGarbage Collection Mode: %s\n" + "%s\tSlice: %s\n" + "%s\tCGroup: %s\n" + "%s\tCGroup realized: %s\n", + prefix, u->id, + prefix, unit_description(u), + prefix, strna(u->instance), + prefix, unit_load_state_to_string(u->load_state), + prefix, unit_active_state_to_string(unit_active_state(u)), + prefix, strna(format_timestamp(timestamp0, sizeof(timestamp0), u->state_change_timestamp.realtime)), + prefix, strna(format_timestamp(timestamp1, sizeof(timestamp1), u->inactive_exit_timestamp.realtime)), + prefix, strna(format_timestamp(timestamp2, sizeof(timestamp2), u->active_enter_timestamp.realtime)), + prefix, strna(format_timestamp(timestamp3, sizeof(timestamp3), u->active_exit_timestamp.realtime)), + prefix, strna(format_timestamp(timestamp4, sizeof(timestamp4), u->inactive_enter_timestamp.realtime)), + prefix, yes_no(unit_may_gc(u)), + prefix, yes_no(unit_need_daemon_reload(u)), + prefix, yes_no(u->transient), + prefix, yes_no(u->perpetual), + prefix, collect_mode_to_string(u->collect_mode), + prefix, strna(unit_slice_name(u)), + prefix, strna(u->cgroup_path), + prefix, yes_no(u->cgroup_realized)); + + if (u->cgroup_realized_mask != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(u->cgroup_realized_mask, &s); + fprintf(f, "%s\tCGroup realized mask: %s\n", prefix, strnull(s)); + } + + if (u->cgroup_enabled_mask != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(u->cgroup_enabled_mask, &s); + fprintf(f, "%s\tCGroup enabled mask: %s\n", prefix, strnull(s)); + } + + m = unit_get_own_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup own mask: %s\n", prefix, strnull(s)); + } + + m = unit_get_members_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup members mask: %s\n", prefix, strnull(s)); + } + + m = unit_get_delegate_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup delegate mask: %s\n", prefix, strnull(s)); + } + + SET_FOREACH(t, u->names, i) + fprintf(f, "%s\tName: %s\n", prefix, t); + + if (!sd_id128_is_null(u->invocation_id)) + fprintf(f, "%s\tInvocation ID: " SD_ID128_FORMAT_STR "\n", + prefix, SD_ID128_FORMAT_VAL(u->invocation_id)); + + STRV_FOREACH(j, u->documentation) + fprintf(f, "%s\tDocumentation: %s\n", prefix, *j); + + following = unit_following(u); + if (following) + fprintf(f, "%s\tFollowing: %s\n", prefix, following->id); + + r = unit_following_set(u, &following_set); + if (r >= 0) { + Unit *other; + + SET_FOREACH(other, following_set, i) + fprintf(f, "%s\tFollowing Set Member: %s\n", prefix, other->id); + } + + if (u->fragment_path) + fprintf(f, "%s\tFragment Path: %s\n", prefix, u->fragment_path); + + if (u->source_path) + fprintf(f, "%s\tSource Path: %s\n", prefix, u->source_path); + + STRV_FOREACH(j, u->dropin_paths) + fprintf(f, "%s\tDropIn Path: %s\n", prefix, *j); + + if (u->failure_action != EMERGENCY_ACTION_NONE) + fprintf(f, "%s\tFailure Action: %s\n", prefix, emergency_action_to_string(u->failure_action)); + if (u->failure_action_exit_status >= 0) + fprintf(f, "%s\tFailure Action Exit Status: %i\n", prefix, u->failure_action_exit_status); + if (u->success_action != EMERGENCY_ACTION_NONE) + fprintf(f, "%s\tSuccess Action: %s\n", prefix, emergency_action_to_string(u->success_action)); + if (u->success_action_exit_status >= 0) + fprintf(f, "%s\tSuccess Action Exit Status: %i\n", prefix, u->success_action_exit_status); + + if (u->job_timeout != USEC_INFINITY) + fprintf(f, "%s\tJob Timeout: %s\n", prefix, format_timespan(timespan, sizeof(timespan), u->job_timeout, 0)); + + if (u->job_timeout_action != EMERGENCY_ACTION_NONE) + fprintf(f, "%s\tJob Timeout Action: %s\n", prefix, emergency_action_to_string(u->job_timeout_action)); + + if (u->job_timeout_reboot_arg) + fprintf(f, "%s\tJob Timeout Reboot Argument: %s\n", prefix, u->job_timeout_reboot_arg); + + condition_dump_list(u->conditions, f, prefix, condition_type_to_string); + condition_dump_list(u->asserts, f, prefix, assert_type_to_string); + + if (dual_timestamp_is_set(&u->condition_timestamp)) + fprintf(f, + "%s\tCondition Timestamp: %s\n" + "%s\tCondition Result: %s\n", + prefix, strna(format_timestamp(timestamp1, sizeof(timestamp1), u->condition_timestamp.realtime)), + prefix, yes_no(u->condition_result)); + + if (dual_timestamp_is_set(&u->assert_timestamp)) + fprintf(f, + "%s\tAssert Timestamp: %s\n" + "%s\tAssert Result: %s\n", + prefix, strna(format_timestamp(timestamp1, sizeof(timestamp1), u->assert_timestamp.realtime)), + prefix, yes_no(u->assert_result)); + + for (d = 0; d < _UNIT_DEPENDENCY_MAX; d++) { + UnitDependencyInfo di; + Unit *other; + + HASHMAP_FOREACH_KEY(di.data, other, u->dependencies[d], i) { + bool space = false; + + fprintf(f, "%s\t%s: %s (", prefix, unit_dependency_to_string(d), other->id); + + print_unit_dependency_mask(f, "origin", di.origin_mask, &space); + print_unit_dependency_mask(f, "destination", di.destination_mask, &space); + + fputs(")\n", f); + } + } + + if (!hashmap_isempty(u->requires_mounts_for)) { + UnitDependencyInfo di; + const char *path; + + HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for, i) { + bool space = false; + + fprintf(f, "%s\tRequiresMountsFor: %s (", prefix, path); + + print_unit_dependency_mask(f, "origin", di.origin_mask, &space); + print_unit_dependency_mask(f, "destination", di.destination_mask, &space); + + fputs(")\n", f); + } + } + + if (u->load_state == UNIT_LOADED) { + + fprintf(f, + "%s\tStopWhenUnneeded: %s\n" + "%s\tRefuseManualStart: %s\n" + "%s\tRefuseManualStop: %s\n" + "%s\tDefaultDependencies: %s\n" + "%s\tOnFailureJobMode: %s\n" + "%s\tIgnoreOnIsolate: %s\n", + prefix, yes_no(u->stop_when_unneeded), + prefix, yes_no(u->refuse_manual_start), + prefix, yes_no(u->refuse_manual_stop), + prefix, yes_no(u->default_dependencies), + prefix, job_mode_to_string(u->on_failure_job_mode), + prefix, yes_no(u->ignore_on_isolate)); + + if (UNIT_VTABLE(u)->dump) + UNIT_VTABLE(u)->dump(u, f, prefix2); + + } else if (u->load_state == UNIT_MERGED) + fprintf(f, + "%s\tMerged into: %s\n", + prefix, u->merged_into->id); + else if (u->load_state == UNIT_ERROR) + fprintf(f, "%s\tLoad Error Code: %s\n", prefix, strerror(-u->load_error)); + + for (n = sd_bus_track_first(u->bus_track); n; n = sd_bus_track_next(u->bus_track)) + fprintf(f, "%s\tBus Ref: %s\n", prefix, n); + + if (u->job) + job_dump(u->job, f, prefix2); + + if (u->nop_job) + job_dump(u->nop_job, f, prefix2); +} + +/* Common implementation for multiple backends */ +int unit_load_fragment_and_dropin(Unit *u) { + int r; + + assert(u); + + /* Load a .{service,socket,...} file */ + r = unit_load_fragment(u); + if (r < 0) + return r; + + if (u->load_state == UNIT_STUB) + return -ENOENT; + + /* Load drop-in directory data. If u is an alias, we might be reloading the + * target unit needlessly. But we cannot be sure which drops-ins have already + * been loaded and which not, at least without doing complicated book-keeping, + * so let's always reread all drop-ins. */ + return unit_load_dropin(unit_follow_merge(u)); +} + +/* Common implementation for multiple backends */ +int unit_load_fragment_and_dropin_optional(Unit *u) { + int r; + + assert(u); + + /* Same as unit_load_fragment_and_dropin(), but whether + * something can be loaded or not doesn't matter. */ + + /* Load a .service/.socket/.slice/… file */ + r = unit_load_fragment(u); + if (r < 0) + return r; + + if (u->load_state == UNIT_STUB) + u->load_state = UNIT_LOADED; + + /* Load drop-in directory data */ + return unit_load_dropin(unit_follow_merge(u)); +} + +void unit_add_to_target_deps_queue(Unit *u) { + Manager *m = u->manager; + + assert(u); + + if (u->in_target_deps_queue) + return; + + LIST_PREPEND(target_deps_queue, m->target_deps_queue, u); + u->in_target_deps_queue = true; +} + +int unit_add_default_target_dependency(Unit *u, Unit *target) { + assert(u); + assert(target); + + if (target->type != UNIT_TARGET) + return 0; + + /* Only add the dependency if both units are loaded, so that + * that loop check below is reliable */ + if (u->load_state != UNIT_LOADED || + target->load_state != UNIT_LOADED) + return 0; + + /* If either side wants no automatic dependencies, then let's + * skip this */ + if (!u->default_dependencies || + !target->default_dependencies) + return 0; + + /* Don't create loops */ + if (hashmap_get(target->dependencies[UNIT_BEFORE], u)) + return 0; + + return unit_add_dependency(target, UNIT_AFTER, u, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int unit_add_slice_dependencies(Unit *u) { + UnitDependencyMask mask; + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + /* Slice units are implicitly ordered against their parent slices (as this relationship is encoded in the + name), while all other units are ordered based on configuration (as in their case Slice= configures the + relationship). */ + mask = u->type == UNIT_SLICE ? UNIT_DEPENDENCY_IMPLICIT : UNIT_DEPENDENCY_FILE; + + if (UNIT_ISSET(u->slice)) + return unit_add_two_dependencies(u, UNIT_AFTER, UNIT_REQUIRES, UNIT_DEREF(u->slice), true, mask); + + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, SPECIAL_ROOT_SLICE, true, mask); +} + +static int unit_add_mount_dependencies(Unit *u) { + UnitDependencyInfo di; + const char *path; + Iterator i; + int r; + + assert(u); + + HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for, i) { + char prefix[strlen(path) + 1]; + + PATH_FOREACH_PREFIX_MORE(prefix, path) { + _cleanup_free_ char *p = NULL; + Unit *m; + + r = unit_name_from_path(prefix, ".mount", &p); + if (r < 0) + return r; + + m = manager_get_unit(u->manager, p); + if (!m) { + /* Make sure to load the mount unit if + * it exists. If so the dependencies + * on this unit will be added later + * during the loading of the mount + * unit. */ + (void) manager_load_unit_prepare(u->manager, p, NULL, NULL, &m); + continue; + } + if (m == u) + continue; + + if (m->load_state != UNIT_LOADED) + continue; + + r = unit_add_dependency(u, UNIT_AFTER, m, true, di.origin_mask); + if (r < 0) + return r; + + if (m->fragment_path) { + r = unit_add_dependency(u, UNIT_REQUIRES, m, true, di.origin_mask); + if (r < 0) + return r; + } + } + } + + return 0; +} + +static int unit_add_startup_units(Unit *u) { + CGroupContext *c; + int r; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + if (c->startup_cpu_shares == CGROUP_CPU_SHARES_INVALID && + c->startup_io_weight == CGROUP_WEIGHT_INVALID && + c->startup_blockio_weight == CGROUP_BLKIO_WEIGHT_INVALID) + return 0; + + r = set_ensure_allocated(&u->manager->startup_units, NULL); + if (r < 0) + return r; + + return set_put(u->manager->startup_units, u); +} + +int unit_load(Unit *u) { + int r; + + assert(u); + + if (u->in_load_queue) { + LIST_REMOVE(load_queue, u->manager->load_queue, u); + u->in_load_queue = false; + } + + if (u->type == _UNIT_TYPE_INVALID) + return -EINVAL; + + if (u->load_state != UNIT_STUB) + return 0; + + if (u->transient_file) { + /* Finalize transient file: if this is a transient unit file, as soon as we reach unit_load() the setup + * is complete, hence let's synchronize the unit file we just wrote to disk. */ + + r = fflush_and_check(u->transient_file); + if (r < 0) + goto fail; + + u->transient_file = safe_fclose(u->transient_file); + u->fragment_mtime = now(CLOCK_REALTIME); + } + + if (UNIT_VTABLE(u)->load) { + r = UNIT_VTABLE(u)->load(u); + if (r < 0) + goto fail; + } + + if (u->load_state == UNIT_STUB) { + r = -ENOENT; + goto fail; + } + + if (u->load_state == UNIT_LOADED) { + unit_add_to_target_deps_queue(u); + + r = unit_add_slice_dependencies(u); + if (r < 0) + goto fail; + + r = unit_add_mount_dependencies(u); + if (r < 0) + goto fail; + + r = unit_add_startup_units(u); + if (r < 0) + goto fail; + + if (u->on_failure_job_mode == JOB_ISOLATE && hashmap_size(u->dependencies[UNIT_ON_FAILURE]) > 1) { + log_unit_error(u, "More than one OnFailure= dependencies specified but OnFailureJobMode=isolate set. Refusing."); + r = -ENOEXEC; + goto fail; + } + + if (u->job_running_timeout != USEC_INFINITY && u->job_running_timeout > u->job_timeout) + log_unit_warning(u, "JobRunningTimeoutSec= is greater than JobTimeoutSec=, it has no effect."); + + /* We finished loading, let's ensure our parents recalculate the members mask */ + unit_invalidate_cgroup_members_masks(u); + } + + assert((u->load_state != UNIT_MERGED) == !u->merged_into); + + unit_add_to_dbus_queue(unit_follow_merge(u)); + unit_add_to_gc_queue(u); + + return 0; + +fail: + /* We convert ENOEXEC errors to the UNIT_BAD_SETTING load state here. Configuration parsing code should hence + * return ENOEXEC to ensure units are placed in this state after loading */ + + u->load_state = u->load_state == UNIT_STUB ? UNIT_NOT_FOUND : + r == -ENOEXEC ? UNIT_BAD_SETTING : + UNIT_ERROR; + u->load_error = r; + + unit_add_to_dbus_queue(u); + unit_add_to_gc_queue(u); + + return log_unit_debug_errno(u, r, "Failed to load configuration: %m"); +} + +static bool unit_condition_test_list(Unit *u, Condition *first, const char *(*to_string)(ConditionType t)) { + Condition *c; + int triggered = -1; + + assert(u); + assert(to_string); + + /* If the condition list is empty, then it is true */ + if (!first) + return true; + + /* Otherwise, if all of the non-trigger conditions apply and + * if any of the trigger conditions apply (unless there are + * none) we return true */ + LIST_FOREACH(conditions, c, first) { + int r; + + r = condition_test(c); + if (r < 0) + log_unit_warning(u, + "Couldn't determine result for %s=%s%s%s, assuming failed: %m", + to_string(c->type), + c->trigger ? "|" : "", + c->negate ? "!" : "", + c->parameter); + else + log_unit_debug(u, + "%s=%s%s%s %s.", + to_string(c->type), + c->trigger ? "|" : "", + c->negate ? "!" : "", + c->parameter, + condition_result_to_string(c->result)); + + if (!c->trigger && r <= 0) + return false; + + if (c->trigger && triggered <= 0) + triggered = r > 0; + } + + return triggered != 0; +} + +static bool unit_condition_test(Unit *u) { + assert(u); + + dual_timestamp_get(&u->condition_timestamp); + u->condition_result = unit_condition_test_list(u, u->conditions, condition_type_to_string); + + unit_add_to_dbus_queue(u); + + return u->condition_result; +} + +static bool unit_assert_test(Unit *u) { + assert(u); + + dual_timestamp_get(&u->assert_timestamp); + u->assert_result = unit_condition_test_list(u, u->asserts, assert_type_to_string); + + unit_add_to_dbus_queue(u); + + return u->assert_result; +} + +void unit_status_printf(Unit *u, const char *status, const char *unit_status_msg_format) { + const char *d; + + d = unit_description(u); + if (log_get_show_color()) + d = strjoina(ANSI_HIGHLIGHT, d, ANSI_NORMAL); + + DISABLE_WARNING_FORMAT_NONLITERAL; + manager_status_printf(u->manager, STATUS_TYPE_NORMAL, status, unit_status_msg_format, d); + REENABLE_WARNING; +} + +int unit_start_limit_test(Unit *u) { + const char *reason; + + assert(u); + + if (ratelimit_below(&u->start_limit)) { + u->start_limit_hit = false; + return 0; + } + + log_unit_warning(u, "Start request repeated too quickly."); + u->start_limit_hit = true; + + reason = strjoina("unit ", u->id, " failed"); + + return emergency_action(u->manager, u->start_limit_action, + EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN, + u->reboot_arg, -1, reason); +} + +bool unit_shall_confirm_spawn(Unit *u) { + assert(u); + + if (manager_is_confirm_spawn_disabled(u->manager)) + return false; + + /* For some reasons units remaining in the same process group + * as PID 1 fail to acquire the console even if it's not used + * by any process. So skip the confirmation question for them. */ + return !unit_get_exec_context(u)->same_pgrp; +} + +static bool unit_verify_deps(Unit *u) { + Unit *other; + Iterator j; + void *v; + + assert(u); + + /* Checks whether all BindsTo= dependencies of this unit are fulfilled — if they are also combined with + * After=. We do not check Requires= or Requisite= here as they only should have an effect on the job + * processing, but do not have any effect afterwards. We don't check BindsTo= dependencies that are not used in + * conjunction with After= as for them any such check would make things entirely racy. */ + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BINDS_TO], j) { + + if (!hashmap_contains(u->dependencies[UNIT_AFTER], other)) + continue; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(other))) { + log_unit_notice(u, "Bound to unit %s, but unit isn't active.", other->id); + return false; + } + } + + return true; +} + +/* Errors: + * -EBADR: This unit type does not support starting. + * -EALREADY: Unit is already started. + * -EAGAIN: An operation is already in progress. Retry later. + * -ECANCELED: Too many requests for now. + * -EPROTO: Assert failed + * -EINVAL: Unit not loaded + * -EOPNOTSUPP: Unit type not supported + * -ENOLINK: The necessary dependencies are not fulfilled. + * -ESTALE: This unit has been started before and can't be started a second time + */ +int unit_start(Unit *u) { + UnitActiveState state; + Unit *following; + + assert(u); + + /* If this is already started, then this will succeed. Note + * that this will even succeed if this unit is not startable + * by the user. This is relied on to detect when we need to + * wait for units and when waiting is finished. */ + state = unit_active_state(u); + if (UNIT_IS_ACTIVE_OR_RELOADING(state)) + return -EALREADY; + + /* Units that aren't loaded cannot be started */ + if (u->load_state != UNIT_LOADED) + return -EINVAL; + + /* Refuse starting scope units more than once */ + if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_enter_timestamp)) + return -ESTALE; + + /* If the conditions failed, don't do anything at all. If we + * already are activating this call might still be useful to + * speed up activation in case there is some hold-off time, + * but we don't want to recheck the condition in that case. */ + if (state != UNIT_ACTIVATING && + !unit_condition_test(u)) { + log_unit_debug(u, "Starting requested but condition failed. Not starting unit."); + return -ECOMM; + } + + /* If the asserts failed, fail the entire job */ + if (state != UNIT_ACTIVATING && + !unit_assert_test(u)) { + log_unit_notice(u, "Starting requested but asserts failed."); + return -EPROTO; + } + + /* Units of types that aren't supported cannot be + * started. Note that we do this test only after the condition + * checks, so that we rather return condition check errors + * (which are usually not considered a true failure) than "not + * supported" errors (which are considered a failure). + */ + if (!unit_supported(u)) + return -EOPNOTSUPP; + + /* Let's make sure that the deps really are in order before we start this. Normally the job engine should have + * taken care of this already, but let's check this here again. After all, our dependencies might not be in + * effect anymore, due to a reload or due to a failed condition. */ + if (!unit_verify_deps(u)) + return -ENOLINK; + + /* Forward to the main object, if we aren't it. */ + following = unit_following(u); + if (following) { + log_unit_debug(u, "Redirecting start request from %s to %s.", u->id, following->id); + return unit_start(following); + } + + /* If it is stopped, but we cannot start it, then fail */ + if (!UNIT_VTABLE(u)->start) + return -EBADR; + + /* We don't suppress calls to ->start() here when we are + * already starting, to allow this request to be used as a + * "hurry up" call, for example when the unit is in some "auto + * restart" state where it waits for a holdoff timer to elapse + * before it will start again. */ + + unit_add_to_dbus_queue(u); + + return UNIT_VTABLE(u)->start(u); +} + +bool unit_can_start(Unit *u) { + assert(u); + + if (u->load_state != UNIT_LOADED) + return false; + + if (!unit_supported(u)) + return false; + + /* Scope units may be started only once */ + if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_exit_timestamp)) + return false; + + return !!UNIT_VTABLE(u)->start; +} + +bool unit_can_isolate(Unit *u) { + assert(u); + + return unit_can_start(u) && + u->allow_isolate; +} + +/* Errors: + * -EBADR: This unit type does not support stopping. + * -EALREADY: Unit is already stopped. + * -EAGAIN: An operation is already in progress. Retry later. + */ +int unit_stop(Unit *u) { + UnitActiveState state; + Unit *following; + + assert(u); + + state = unit_active_state(u); + if (UNIT_IS_INACTIVE_OR_FAILED(state)) + return -EALREADY; + + following = unit_following(u); + if (following) { + log_unit_debug(u, "Redirecting stop request from %s to %s.", u->id, following->id); + return unit_stop(following); + } + + if (!UNIT_VTABLE(u)->stop) + return -EBADR; + + unit_add_to_dbus_queue(u); + + return UNIT_VTABLE(u)->stop(u); +} + +bool unit_can_stop(Unit *u) { + assert(u); + + if (!unit_supported(u)) + return false; + + if (u->perpetual) + return false; + + return !!UNIT_VTABLE(u)->stop; +} + +/* Errors: + * -EBADR: This unit type does not support reloading. + * -ENOEXEC: Unit is not started. + * -EAGAIN: An operation is already in progress. Retry later. + */ +int unit_reload(Unit *u) { + UnitActiveState state; + Unit *following; + + assert(u); + + if (u->load_state != UNIT_LOADED) + return -EINVAL; + + if (!unit_can_reload(u)) + return -EBADR; + + state = unit_active_state(u); + if (state == UNIT_RELOADING) + return -EALREADY; + + if (state != UNIT_ACTIVE) { + log_unit_warning(u, "Unit cannot be reloaded because it is inactive."); + return -ENOEXEC; + } + + following = unit_following(u); + if (following) { + log_unit_debug(u, "Redirecting reload request from %s to %s.", u->id, following->id); + return unit_reload(following); + } + + unit_add_to_dbus_queue(u); + + if (!UNIT_VTABLE(u)->reload) { + /* Unit doesn't have a reload function, but we need to propagate the reload anyway */ + unit_notify(u, unit_active_state(u), unit_active_state(u), 0); + return 0; + } + + return UNIT_VTABLE(u)->reload(u); +} + +bool unit_can_reload(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->can_reload) + return UNIT_VTABLE(u)->can_reload(u); + + if (!hashmap_isempty(u->dependencies[UNIT_PROPAGATES_RELOAD_TO])) + return true; + + return UNIT_VTABLE(u)->reload; +} + +bool unit_is_unneeded(Unit *u) { + static const UnitDependency deps[] = { + UNIT_REQUIRED_BY, + UNIT_REQUISITE_OF, + UNIT_WANTED_BY, + UNIT_BOUND_BY, + }; + size_t j; + + assert(u); + + if (!u->stop_when_unneeded) + return false; + + /* Don't clean up while the unit is transitioning or is even inactive. */ + if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return false; + if (u->job) + return false; + + for (j = 0; j < ELEMENTSOF(deps); j++) { + Unit *other; + Iterator i; + void *v; + + /* If a dependent unit has a job queued, is active or transitioning, or is marked for + * restart, then don't clean this one up. */ + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[deps[j]], i) { + if (other->job) + return false; + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) + return false; + + if (unit_will_restart(other)) + return false; + } + } + + return true; +} + +static void check_unneeded_dependencies(Unit *u) { + + static const UnitDependency deps[] = { + UNIT_REQUIRES, + UNIT_REQUISITE, + UNIT_WANTS, + UNIT_BINDS_TO, + }; + size_t j; + + assert(u); + + /* Add all units this unit depends on to the queue that processes StopWhenUnneeded= behaviour. */ + + for (j = 0; j < ELEMENTSOF(deps); j++) { + Unit *other; + Iterator i; + void *v; + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[deps[j]], i) + unit_submit_to_stop_when_unneeded_queue(other); + } +} + +static void unit_check_binds_to(Unit *u) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + bool stop = false; + Unit *other; + Iterator i; + void *v; + int r; + + assert(u); + + if (u->job) + return; + + if (unit_active_state(u) != UNIT_ACTIVE) + return; + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BINDS_TO], i) { + if (other->job) + continue; + + if (!other->coldplugged) + /* We might yet create a job for the other unit… */ + continue; + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) + continue; + + stop = true; + break; + } + + if (!stop) + return; + + /* If stopping a unit fails continuously we might enter a stop + * loop here, hence stop acting on the service being + * unnecessary after a while. */ + if (!ratelimit_below(&u->auto_stop_ratelimit)) { + log_unit_warning(u, "Unit is bound to inactive unit %s, but not stopping since we tried this too often recently.", other->id); + return; + } + + assert(other); + log_unit_info(u, "Unit is bound to inactive unit %s. Stopping, too.", other->id); + + /* A unit we need to run is gone. Sniff. Let's stop this. */ + r = manager_add_job(u->manager, JOB_STOP, u, JOB_FAIL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r)); +} + +static void retroactively_start_dependencies(Unit *u) { + Iterator i; + Unit *other; + void *v; + + assert(u); + assert(UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u))); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_REQUIRES], i) + if (!hashmap_get(u->dependencies[UNIT_AFTER], other) && + !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BINDS_TO], i) + if (!hashmap_get(u->dependencies[UNIT_AFTER], other) && + !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_WANTS], i) + if (!hashmap_get(u->dependencies[UNIT_AFTER], other) && + !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_START, other, JOB_FAIL, NULL, NULL); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_CONFLICTS], i) + if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_CONFLICTED_BY], i) + if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL); +} + +static void retroactively_stop_dependencies(Unit *u) { + Unit *other; + Iterator i; + void *v; + + assert(u); + assert(UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u))); + + /* Pull down units which are bound to us recursively if enabled */ + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BOUND_BY], i) + if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL); +} + +void unit_start_on_failure(Unit *u) { + Unit *other; + Iterator i; + void *v; + int r; + + assert(u); + + if (hashmap_size(u->dependencies[UNIT_ON_FAILURE]) <= 0) + return; + + log_unit_info(u, "Triggering OnFailure= dependencies."); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_ON_FAILURE], i) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + r = manager_add_job(u->manager, JOB_START, other, u->on_failure_job_mode, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue OnFailure= job, ignoring: %s", bus_error_message(&error, r)); + } +} + +void unit_trigger_notify(Unit *u) { + Unit *other; + Iterator i; + void *v; + + assert(u); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_TRIGGERED_BY], i) + if (UNIT_VTABLE(other)->trigger_notify) + UNIT_VTABLE(other)->trigger_notify(other, u); +} + +static int unit_log_resources(Unit *u) { + struct iovec iovec[1 + _CGROUP_IP_ACCOUNTING_METRIC_MAX + 4]; + bool any_traffic = false, have_ip_accounting = false; + _cleanup_free_ char *igress = NULL, *egress = NULL; + size_t n_message_parts = 0, n_iovec = 0; + char* message_parts[3 + 1], *t; + nsec_t nsec = NSEC_INFINITY; + CGroupIPAccountingMetric m; + size_t i; + int r; + const char* const ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "IP_METRIC_INGRESS_BYTES", + [CGROUP_IP_INGRESS_PACKETS] = "IP_METRIC_INGRESS_PACKETS", + [CGROUP_IP_EGRESS_BYTES] = "IP_METRIC_EGRESS_BYTES", + [CGROUP_IP_EGRESS_PACKETS] = "IP_METRIC_EGRESS_PACKETS", + }; + + assert(u); + + /* Invoked whenever a unit enters failed or dead state. Logs information about consumed resources if resource + * accounting was enabled for a unit. It does this in two ways: a friendly human readable string with reduced + * information and the complete data in structured fields. */ + + (void) unit_get_cpu_usage(u, &nsec); + if (nsec != NSEC_INFINITY) { + char buf[FORMAT_TIMESPAN_MAX] = ""; + + /* Format the CPU time for inclusion in the structured log message */ + if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, nsec) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the CPU time for inclusion in the human language message string */ + format_timespan(buf, sizeof(buf), nsec / NSEC_PER_USEC, USEC_PER_MSEC); + t = strjoin("consumed ", buf, " CPU time"); + if (!t) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = t; + } + + for (m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { + char buf[FORMAT_BYTES_MAX] = ""; + uint64_t value = UINT64_MAX; + + assert(ip_fields[m]); + + (void) unit_get_ip_accounting(u, m, &value); + if (value == UINT64_MAX) + continue; + + have_ip_accounting = true; + if (value > 0) + any_traffic = true; + + /* Format IP accounting data for inclusion in the structured log message */ + if (asprintf(&t, "%s=%" PRIu64, ip_fields[m], value) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the IP accounting data for inclusion in the human language message string, but only for the + * bytes counters (and not for the packets counters) */ + if (m == CGROUP_IP_INGRESS_BYTES) { + assert(!igress); + igress = strjoin("received ", format_bytes(buf, sizeof(buf), value), " IP traffic"); + if (!igress) { + r = log_oom(); + goto finish; + } + } else if (m == CGROUP_IP_EGRESS_BYTES) { + assert(!egress); + egress = strjoin("sent ", format_bytes(buf, sizeof(buf), value), " IP traffic"); + if (!egress) { + r = log_oom(); + goto finish; + } + } + } + + if (have_ip_accounting) { + if (any_traffic) { + if (igress) + message_parts[n_message_parts++] = TAKE_PTR(igress); + if (egress) + message_parts[n_message_parts++] = TAKE_PTR(egress); + + } else { + char *k; + + k = strdup("no IP traffic"); + if (!k) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = k; + } + } + + /* Is there any accounting data available at all? */ + if (n_iovec == 0) { + r = 0; + goto finish; + } + + if (n_message_parts == 0) + t = strjoina("MESSAGE=", u->id, ": Completed."); + else { + _cleanup_free_ char *joined; + + message_parts[n_message_parts] = NULL; + + joined = strv_join(message_parts, ", "); + if (!joined) { + r = log_oom(); + goto finish; + } + + joined[0] = ascii_toupper(joined[0]); + t = strjoina("MESSAGE=", u->id, ": ", joined, "."); + } + + /* The following four fields we allocate on the stack or are static strings, we hence don't want to free them, + * and hence don't increase n_iovec for them */ + iovec[n_iovec] = IOVEC_MAKE_STRING(t); + iovec[n_iovec + 1] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_UNIT_RESOURCES_STR); + + t = strjoina(u->manager->unit_log_field, u->id); + iovec[n_iovec + 2] = IOVEC_MAKE_STRING(t); + + t = strjoina(u->manager->invocation_log_field, u->invocation_id_string); + iovec[n_iovec + 3] = IOVEC_MAKE_STRING(t); + + log_struct_iovec(LOG_INFO, iovec, n_iovec + 4); + r = 0; + +finish: + for (i = 0; i < n_message_parts; i++) + free(message_parts[i]); + + for (i = 0; i < n_iovec; i++) + free(iovec[i].iov_base); + + return r; + +} + +static void unit_update_on_console(Unit *u) { + bool b; + + assert(u); + + b = unit_needs_console(u); + if (u->on_console == b) + return; + + u->on_console = b; + if (b) + manager_ref_console(u->manager); + else + manager_unref_console(u->manager); +} + +static void unit_emit_audit_start(Unit *u) { + assert(u); + + if (u->type != UNIT_SERVICE) + return; + + /* Write audit record if we have just finished starting up */ + manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_START, true); + u->in_audit = true; +} + +static void unit_emit_audit_stop(Unit *u, UnitActiveState state) { + assert(u); + + if (u->type != UNIT_SERVICE) + return; + + if (u->in_audit) { + /* Write audit record if we have just finished shutting down */ + manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_STOP, state == UNIT_INACTIVE); + u->in_audit = false; + } else { + /* Hmm, if there was no start record written write it now, so that we always have a nice pair */ + manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_START, state == UNIT_INACTIVE); + + if (state == UNIT_INACTIVE) + manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_STOP, true); + } +} + +static bool unit_process_job(Job *j, UnitActiveState ns, UnitNotifyFlags flags) { + bool unexpected = false; + + assert(j); + + if (j->state == JOB_WAITING) + + /* So we reached a different state for this job. Let's see if we can run it now if it failed previously + * due to EAGAIN. */ + job_add_to_run_queue(j); + + /* Let's check whether the unit's new state constitutes a finished job, or maybe contradicts a running job and + * hence needs to invalidate jobs. */ + + switch (j->type) { + + case JOB_START: + case JOB_VERIFY_ACTIVE: + + if (UNIT_IS_ACTIVE_OR_RELOADING(ns)) + job_finish_and_invalidate(j, JOB_DONE, true, false); + else if (j->state == JOB_RUNNING && ns != UNIT_ACTIVATING) { + unexpected = true; + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) + job_finish_and_invalidate(j, ns == UNIT_FAILED ? JOB_FAILED : JOB_DONE, true, false); + } + + break; + + case JOB_RELOAD: + case JOB_RELOAD_OR_START: + case JOB_TRY_RELOAD: + + if (j->state == JOB_RUNNING) { + if (ns == UNIT_ACTIVE) + job_finish_and_invalidate(j, (flags & UNIT_NOTIFY_RELOAD_FAILURE) ? JOB_FAILED : JOB_DONE, true, false); + else if (!IN_SET(ns, UNIT_ACTIVATING, UNIT_RELOADING)) { + unexpected = true; + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) + job_finish_and_invalidate(j, ns == UNIT_FAILED ? JOB_FAILED : JOB_DONE, true, false); + } + } + + break; + + case JOB_STOP: + case JOB_RESTART: + case JOB_TRY_RESTART: + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) + job_finish_and_invalidate(j, JOB_DONE, true, false); + else if (j->state == JOB_RUNNING && ns != UNIT_DEACTIVATING) { + unexpected = true; + job_finish_and_invalidate(j, JOB_FAILED, true, false); + } + + break; + + default: + assert_not_reached("Job type unknown"); + } + + return unexpected; +} + +void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, UnitNotifyFlags flags) { + const char *reason; + Manager *m; + + assert(u); + assert(os < _UNIT_ACTIVE_STATE_MAX); + assert(ns < _UNIT_ACTIVE_STATE_MAX); + + /* Note that this is called for all low-level state changes, even if they might map to the same high-level + * UnitActiveState! That means that ns == os is an expected behavior here. For example: if a mount point is + * remounted this function will be called too! */ + + m = u->manager; + + /* Let's enqueue the change signal early. In case this unit has a job associated we want that this unit is in + * the bus queue, so that any job change signal queued will force out the unit change signal first. */ + unit_add_to_dbus_queue(u); + + /* Update timestamps for state changes */ + if (!MANAGER_IS_RELOADING(m)) { + dual_timestamp_get(&u->state_change_timestamp); + + if (UNIT_IS_INACTIVE_OR_FAILED(os) && !UNIT_IS_INACTIVE_OR_FAILED(ns)) + u->inactive_exit_timestamp = u->state_change_timestamp; + else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_INACTIVE_OR_FAILED(ns)) + u->inactive_enter_timestamp = u->state_change_timestamp; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(os) && UNIT_IS_ACTIVE_OR_RELOADING(ns)) + u->active_enter_timestamp = u->state_change_timestamp; + else if (UNIT_IS_ACTIVE_OR_RELOADING(os) && !UNIT_IS_ACTIVE_OR_RELOADING(ns)) + u->active_exit_timestamp = u->state_change_timestamp; + } + + /* Keep track of failed units */ + (void) manager_update_failed_units(m, u, ns == UNIT_FAILED); + + /* Make sure the cgroup and state files are always removed when we become inactive */ + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) { + unit_prune_cgroup(u); + unit_unlink_state_files(u); + } + + unit_update_on_console(u); + + if (!MANAGER_IS_RELOADING(m)) { + bool unexpected; + + /* Let's propagate state changes to the job */ + if (u->job) + unexpected = unit_process_job(u->job, ns, flags); + else + unexpected = true; + + /* If this state change happened without being requested by a job, then let's retroactively start or + * stop dependencies. We skip that step when deserializing, since we don't want to create any + * additional jobs just because something is already activated. */ + + if (unexpected) { + if (UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_ACTIVE_OR_ACTIVATING(ns)) + retroactively_start_dependencies(u); + else if (UNIT_IS_ACTIVE_OR_ACTIVATING(os) && UNIT_IS_INACTIVE_OR_DEACTIVATING(ns)) + retroactively_stop_dependencies(u); + } + + /* stop unneeded units regardless if going down was expected or not */ + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) + check_unneeded_dependencies(u); + + if (ns != os && ns == UNIT_FAILED) { + log_unit_debug(u, "Unit entered failed state."); + + if (!(flags & UNIT_NOTIFY_WILL_AUTO_RESTART)) + unit_start_on_failure(u); + } + + if (UNIT_IS_ACTIVE_OR_RELOADING(ns) && !UNIT_IS_ACTIVE_OR_RELOADING(os)) { + /* This unit just finished starting up */ + + unit_emit_audit_start(u); + manager_send_unit_plymouth(m, u); + } + + if (UNIT_IS_INACTIVE_OR_FAILED(ns) && !UNIT_IS_INACTIVE_OR_FAILED(os)) { + /* This unit just stopped/failed. */ + + unit_emit_audit_stop(u, ns); + unit_log_resources(u); + } + } + + manager_recheck_journal(m); + manager_recheck_dbus(m); + + unit_trigger_notify(u); + + if (!MANAGER_IS_RELOADING(m)) { + /* Maybe we finished startup and are now ready for being stopped because unneeded? */ + unit_submit_to_stop_when_unneeded_queue(u); + + /* Maybe we finished startup, but something we needed has vanished? Let's die then. (This happens when + * something BindsTo= to a Type=oneshot unit, as these units go directly from starting to inactive, + * without ever entering started.) */ + unit_check_binds_to(u); + + if (os != UNIT_FAILED && ns == UNIT_FAILED) { + reason = strjoina("unit ", u->id, " failed"); + (void) emergency_action(m, u->failure_action, 0, u->reboot_arg, unit_failure_action_exit_status(u), reason); + } else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && ns == UNIT_INACTIVE) { + reason = strjoina("unit ", u->id, " succeeded"); + (void) emergency_action(m, u->success_action, 0, u->reboot_arg, unit_success_action_exit_status(u), reason); + } + } + + unit_add_to_gc_queue(u); +} + +int unit_watch_pid(Unit *u, pid_t pid) { + int r; + + assert(u); + assert(pid_is_valid(pid)); + + /* Watch a specific PID */ + + r = set_ensure_allocated(&u->pids, NULL); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&u->manager->watch_pids, NULL); + if (r < 0) + return r; + + /* First try, let's add the unit keyed by "pid". */ + r = hashmap_put(u->manager->watch_pids, PID_TO_PTR(pid), u); + if (r == -EEXIST) { + Unit **array; + bool found = false; + size_t n = 0; + + /* OK, the "pid" key is already assigned to a different unit. Let's see if the "-pid" key (which points + * to an array of Units rather than just a Unit), lists us already. */ + + array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid)); + if (array) + for (; array[n]; n++) + if (array[n] == u) + found = true; + + if (found) /* Found it already? if so, do nothing */ + r = 0; + else { + Unit **new_array; + + /* Allocate a new array */ + new_array = new(Unit*, n + 2); + if (!new_array) + return -ENOMEM; + + memcpy_safe(new_array, array, sizeof(Unit*) * n); + new_array[n] = u; + new_array[n+1] = NULL; + + /* Add or replace the old array */ + r = hashmap_replace(u->manager->watch_pids, PID_TO_PTR(-pid), new_array); + if (r < 0) { + free(new_array); + return r; + } + + free(array); + } + } else if (r < 0) + return r; + + r = set_put(u->pids, PID_TO_PTR(pid)); + if (r < 0) + return r; + + return 0; +} + +void unit_unwatch_pid(Unit *u, pid_t pid) { + Unit **array; + + assert(u); + assert(pid_is_valid(pid)); + + /* First let's drop the unit in case it's keyed as "pid". */ + (void) hashmap_remove_value(u->manager->watch_pids, PID_TO_PTR(pid), u); + + /* Then, let's also drop the unit, in case it's in the array keyed by -pid */ + array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid)); + if (array) { + size_t n, m = 0; + + /* Let's iterate through the array, dropping our own entry */ + for (n = 0; array[n]; n++) + if (array[n] != u) + array[m++] = array[n]; + array[m] = NULL; + + if (m == 0) { + /* The array is now empty, remove the entire entry */ + assert(hashmap_remove(u->manager->watch_pids, PID_TO_PTR(-pid)) == array); + free(array); + } + } + + (void) set_remove(u->pids, PID_TO_PTR(pid)); +} + +void unit_unwatch_all_pids(Unit *u) { + assert(u); + + while (!set_isempty(u->pids)) + unit_unwatch_pid(u, PTR_TO_PID(set_first(u->pids))); + + u->pids = set_free(u->pids); +} + +static void unit_tidy_watch_pids(Unit *u) { + pid_t except1, except2; + Iterator i; + void *e; + + assert(u); + + /* Cleans dead PIDs from our list */ + + except1 = unit_main_pid(u); + except2 = unit_control_pid(u); + + SET_FOREACH(e, u->pids, i) { + pid_t pid = PTR_TO_PID(e); + + if (pid == except1 || pid == except2) + continue; + + if (!pid_is_unwaited(pid)) + unit_unwatch_pid(u, pid); + } +} + +static int on_rewatch_pids_event(sd_event_source *s, void *userdata) { + Unit *u = userdata; + + assert(s); + assert(u); + + unit_tidy_watch_pids(u); + unit_watch_all_pids(u); + + /* If the PID set is empty now, then let's finish this off. */ + unit_synthesize_cgroup_empty_event(u); + + return 0; +} + +int unit_enqueue_rewatch_pids(Unit *u) { + int r; + + assert(u); + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we can use proper notifications */ + return 0; + + /* Enqueues a low-priority job that will clean up dead PIDs from our list of PIDs to watch and subscribe to new + * PIDs that might have appeared. We do this in a delayed job because the work might be quite slow, as it + * involves issuing kill(pid, 0) on all processes we watch. */ + + if (!u->rewatch_pids_event_source) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + + r = sd_event_add_defer(u->manager->event, &s, on_rewatch_pids_event, u); + if (r < 0) + return log_error_errno(r, "Failed to allocate event source for tidying watched PIDs: %m"); + + r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return log_error_errno(r, "Failed to adjust priority of event source for tidying watched PIDs: m"); + + (void) sd_event_source_set_description(s, "tidy-watch-pids"); + + u->rewatch_pids_event_source = TAKE_PTR(s); + } + + r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Failed to enable event source for tidying watched PIDs: %m"); + + return 0; +} + +void unit_dequeue_rewatch_pids(Unit *u) { + int r; + assert(u); + + if (!u->rewatch_pids_event_source) + return; + + r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_OFF); + if (r < 0) + log_warning_errno(r, "Failed to disable event source for tidying watched PIDs, ignoring: %m"); + + u->rewatch_pids_event_source = sd_event_source_unref(u->rewatch_pids_event_source); +} + +bool unit_job_is_applicable(Unit *u, JobType j) { + assert(u); + assert(j >= 0 && j < _JOB_TYPE_MAX); + + switch (j) { + + case JOB_VERIFY_ACTIVE: + case JOB_START: + case JOB_NOP: + /* Note that we don't check unit_can_start() here. That's because .device units and suchlike are not + * startable by us but may appear due to external events, and it thus makes sense to permit enqueing + * jobs for it. */ + return true; + + case JOB_STOP: + /* Similar as above. However, perpetual units can never be stopped (neither explicitly nor due to + * external events), hence it makes no sense to permit enqueing such a request either. */ + return !u->perpetual; + + case JOB_RESTART: + case JOB_TRY_RESTART: + return unit_can_stop(u) && unit_can_start(u); + + case JOB_RELOAD: + case JOB_TRY_RELOAD: + return unit_can_reload(u); + + case JOB_RELOAD_OR_START: + return unit_can_reload(u) && unit_can_start(u); + + default: + assert_not_reached("Invalid job type"); + } +} + +static void maybe_warn_about_dependency(Unit *u, const char *other, UnitDependency dependency) { + assert(u); + + /* Only warn about some unit types */ + if (!IN_SET(dependency, UNIT_CONFLICTS, UNIT_CONFLICTED_BY, UNIT_BEFORE, UNIT_AFTER, UNIT_ON_FAILURE, UNIT_TRIGGERS, UNIT_TRIGGERED_BY)) + return; + + if (streq_ptr(u->id, other)) + log_unit_warning(u, "Dependency %s=%s dropped", unit_dependency_to_string(dependency), u->id); + else + log_unit_warning(u, "Dependency %s=%s dropped, merged into %s", unit_dependency_to_string(dependency), strna(other), u->id); +} + +static int unit_add_dependency_hashmap( + Hashmap **h, + Unit *other, + UnitDependencyMask origin_mask, + UnitDependencyMask destination_mask) { + + UnitDependencyInfo info; + int r; + + assert(h); + assert(other); + assert(origin_mask < _UNIT_DEPENDENCY_MASK_FULL); + assert(destination_mask < _UNIT_DEPENDENCY_MASK_FULL); + assert(origin_mask > 0 || destination_mask > 0); + + r = hashmap_ensure_allocated(h, NULL); + if (r < 0) + return r; + + assert_cc(sizeof(void*) == sizeof(info)); + + info.data = hashmap_get(*h, other); + if (info.data) { + /* Entry already exists. Add in our mask. */ + + if (FLAGS_SET(origin_mask, info.origin_mask) && + FLAGS_SET(destination_mask, info.destination_mask)) + return 0; /* NOP */ + + info.origin_mask |= origin_mask; + info.destination_mask |= destination_mask; + + r = hashmap_update(*h, other, info.data); + } else { + info = (UnitDependencyInfo) { + .origin_mask = origin_mask, + .destination_mask = destination_mask, + }; + + r = hashmap_put(*h, other, info.data); + } + if (r < 0) + return r; + + return 1; +} + +int unit_add_dependency( + Unit *u, + UnitDependency d, + Unit *other, + bool add_reference, + UnitDependencyMask mask) { + + static const UnitDependency inverse_table[_UNIT_DEPENDENCY_MAX] = { + [UNIT_REQUIRES] = UNIT_REQUIRED_BY, + [UNIT_WANTS] = UNIT_WANTED_BY, + [UNIT_REQUISITE] = UNIT_REQUISITE_OF, + [UNIT_BINDS_TO] = UNIT_BOUND_BY, + [UNIT_PART_OF] = UNIT_CONSISTS_OF, + [UNIT_REQUIRED_BY] = UNIT_REQUIRES, + [UNIT_REQUISITE_OF] = UNIT_REQUISITE, + [UNIT_WANTED_BY] = UNIT_WANTS, + [UNIT_BOUND_BY] = UNIT_BINDS_TO, + [UNIT_CONSISTS_OF] = UNIT_PART_OF, + [UNIT_CONFLICTS] = UNIT_CONFLICTED_BY, + [UNIT_CONFLICTED_BY] = UNIT_CONFLICTS, + [UNIT_BEFORE] = UNIT_AFTER, + [UNIT_AFTER] = UNIT_BEFORE, + [UNIT_ON_FAILURE] = _UNIT_DEPENDENCY_INVALID, + [UNIT_REFERENCES] = UNIT_REFERENCED_BY, + [UNIT_REFERENCED_BY] = UNIT_REFERENCES, + [UNIT_TRIGGERS] = UNIT_TRIGGERED_BY, + [UNIT_TRIGGERED_BY] = UNIT_TRIGGERS, + [UNIT_PROPAGATES_RELOAD_TO] = UNIT_RELOAD_PROPAGATED_FROM, + [UNIT_RELOAD_PROPAGATED_FROM] = UNIT_PROPAGATES_RELOAD_TO, + [UNIT_JOINS_NAMESPACE_OF] = UNIT_JOINS_NAMESPACE_OF, + }; + Unit *original_u = u, *original_other = other; + int r; + + assert(u); + assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX); + assert(other); + + u = unit_follow_merge(u); + other = unit_follow_merge(other); + + /* We won't allow dependencies on ourselves. We will not + * consider them an error however. */ + if (u == other) { + maybe_warn_about_dependency(original_u, original_other->id, d); + return 0; + } + + if ((d == UNIT_BEFORE && other->type == UNIT_DEVICE) || + (d == UNIT_AFTER && u->type == UNIT_DEVICE)) { + log_unit_warning(u, "Dependency Before=%s ignored (.device units cannot be delayed)", other->id); + return 0; + } + + r = unit_add_dependency_hashmap(u->dependencies + d, other, mask, 0); + if (r < 0) + return r; + + if (inverse_table[d] != _UNIT_DEPENDENCY_INVALID && inverse_table[d] != d) { + r = unit_add_dependency_hashmap(other->dependencies + inverse_table[d], u, 0, mask); + if (r < 0) + return r; + } + + if (add_reference) { + r = unit_add_dependency_hashmap(u->dependencies + UNIT_REFERENCES, other, mask, 0); + if (r < 0) + return r; + + r = unit_add_dependency_hashmap(other->dependencies + UNIT_REFERENCED_BY, u, 0, mask); + if (r < 0) + return r; + } + + unit_add_to_dbus_queue(u); + return 0; +} + +int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask) { + int r; + + assert(u); + + r = unit_add_dependency(u, d, other, add_reference, mask); + if (r < 0) + return r; + + return unit_add_dependency(u, e, other, add_reference, mask); +} + +static int resolve_template(Unit *u, const char *name, char **buf, const char **ret) { + int r; + + assert(u); + assert(name); + assert(buf); + assert(ret); + + if (!unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + *buf = NULL; + *ret = name; + return 0; + } + + if (u->instance) + r = unit_name_replace_instance(name, u->instance, buf); + else { + _cleanup_free_ char *i = NULL; + + r = unit_name_to_prefix(u->id, &i); + if (r < 0) + return r; + + r = unit_name_replace_instance(name, i, buf); + } + if (r < 0) + return r; + + *ret = *buf; + return 0; +} + +int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask) { + _cleanup_free_ char *buf = NULL; + Unit *other; + int r; + + assert(u); + assert(name); + + r = resolve_template(u, name, &buf, &name); + if (r < 0) + return r; + + r = manager_load_unit(u->manager, name, NULL, NULL, &other); + if (r < 0) + return r; + + return unit_add_dependency(u, d, other, add_reference, mask); +} + +int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask) { + _cleanup_free_ char *buf = NULL; + Unit *other; + int r; + + assert(u); + assert(name); + + r = resolve_template(u, name, &buf, &name); + if (r < 0) + return r; + + r = manager_load_unit(u->manager, name, NULL, NULL, &other); + if (r < 0) + return r; + + return unit_add_two_dependencies(u, d, e, other, add_reference, mask); +} + +int set_unit_path(const char *p) { + /* This is mostly for debug purposes */ + if (setenv("SYSTEMD_UNIT_PATH", p, 1) < 0) + return -errno; + + return 0; +} + +char *unit_dbus_path(Unit *u) { + assert(u); + + if (!u->id) + return NULL; + + return unit_dbus_path_from_name(u->id); +} + +char *unit_dbus_path_invocation_id(Unit *u) { + assert(u); + + if (sd_id128_is_null(u->invocation_id)) + return NULL; + + return unit_dbus_path_from_name(u->invocation_id_string); +} + +int unit_set_slice(Unit *u, Unit *slice) { + assert(u); + assert(slice); + + /* Sets the unit slice if it has not been set before. Is extra + * careful, to only allow this for units that actually have a + * cgroup context. Also, we don't allow to set this for slices + * (since the parent slice is derived from the name). Make + * sure the unit we set is actually a slice. */ + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EOPNOTSUPP; + + if (u->type == UNIT_SLICE) + return -EINVAL; + + if (unit_active_state(u) != UNIT_INACTIVE) + return -EBUSY; + + if (slice->type != UNIT_SLICE) + return -EINVAL; + + if (unit_has_name(u, SPECIAL_INIT_SCOPE) && + !unit_has_name(slice, SPECIAL_ROOT_SLICE)) + return -EPERM; + + if (UNIT_DEREF(u->slice) == slice) + return 0; + + /* Disallow slice changes if @u is already bound to cgroups */ + if (UNIT_ISSET(u->slice) && u->cgroup_realized) + return -EBUSY; + + unit_ref_set(&u->slice, u, slice); + return 1; +} + +int unit_set_default_slice(Unit *u) { + const char *slice_name; + Unit *slice; + int r; + + assert(u); + + if (UNIT_ISSET(u->slice)) + return 0; + + if (u->instance) { + _cleanup_free_ char *prefix = NULL, *escaped = NULL; + + /* Implicitly place all instantiated units in their + * own per-template slice */ + + r = unit_name_to_prefix(u->id, &prefix); + if (r < 0) + return r; + + /* The prefix is already escaped, but it might include + * "-" which has a special meaning for slice units, + * hence escape it here extra. */ + escaped = unit_name_escape(prefix); + if (!escaped) + return -ENOMEM; + + if (MANAGER_IS_SYSTEM(u->manager)) + slice_name = strjoina("system-", escaped, ".slice"); + else + slice_name = strjoina(escaped, ".slice"); + } else + slice_name = + MANAGER_IS_SYSTEM(u->manager) && !unit_has_name(u, SPECIAL_INIT_SCOPE) + ? SPECIAL_SYSTEM_SLICE + : SPECIAL_ROOT_SLICE; + + r = manager_load_unit(u->manager, slice_name, NULL, NULL, &slice); + if (r < 0) + return r; + + return unit_set_slice(u, slice); +} + +const char *unit_slice_name(Unit *u) { + assert(u); + + if (!UNIT_ISSET(u->slice)) + return NULL; + + return UNIT_DEREF(u->slice)->id; +} + +int unit_load_related_unit(Unit *u, const char *type, Unit **_found) { + _cleanup_free_ char *t = NULL; + int r; + + assert(u); + assert(type); + assert(_found); + + r = unit_name_change_suffix(u->id, type, &t); + if (r < 0) + return r; + if (unit_has_name(u, t)) + return -EINVAL; + + r = manager_load_unit(u->manager, t, NULL, NULL, _found); + assert(r < 0 || *_found != u); + return r; +} + +static int signal_name_owner_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *name, *old_owner, *new_owner; + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + r = sd_bus_message_read(message, "sss", &name, &old_owner, &new_owner); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + old_owner = empty_to_null(old_owner); + new_owner = empty_to_null(new_owner); + + if (UNIT_VTABLE(u)->bus_name_owner_change) + UNIT_VTABLE(u)->bus_name_owner_change(u, name, old_owner, new_owner); + + return 0; +} + +int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name) { + const char *match; + + assert(u); + assert(bus); + assert(name); + + if (u->match_bus_slot) + return -EBUSY; + + match = strjoina("type='signal'," + "sender='org.freedesktop.DBus'," + "path='/org/freedesktop/DBus'," + "interface='org.freedesktop.DBus'," + "member='NameOwnerChanged'," + "arg0='", name, "'"); + + return sd_bus_add_match_async(bus, &u->match_bus_slot, match, signal_name_owner_changed, NULL, u); +} + +int unit_watch_bus_name(Unit *u, const char *name) { + int r; + + assert(u); + assert(name); + + /* Watch a specific name on the bus. We only support one unit + * watching each name for now. */ + + if (u->manager->api_bus) { + /* If the bus is already available, install the match directly. + * Otherwise, just put the name in the list. bus_setup_api() will take care later. */ + r = unit_install_bus_match(u, u->manager->api_bus, name); + if (r < 0) + return log_warning_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name); + } + + r = hashmap_put(u->manager->watch_bus, name, u); + if (r < 0) { + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + return log_warning_errno(r, "Failed to put bus name to hashmap: %m"); + } + + return 0; +} + +void unit_unwatch_bus_name(Unit *u, const char *name) { + assert(u); + assert(name); + + (void) hashmap_remove_value(u->manager->watch_bus, name, u); + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); +} + +bool unit_can_serialize(Unit *u) { + assert(u); + + return UNIT_VTABLE(u)->serialize && UNIT_VTABLE(u)->deserialize_item; +} + +static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) { + _cleanup_free_ char *s = NULL; + int r; + + assert(f); + assert(key); + + if (mask == 0) + return 0; + + r = cg_mask_to_string(mask, &s); + if (r < 0) + return log_error_errno(r, "Failed to format cgroup mask: %m"); + + return serialize_item(f, key, s); +} + +static const char *ip_accounting_metric_field[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes", + [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets", + [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes", + [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets", +}; + +int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) { + CGroupIPAccountingMetric m; + int r; + + assert(u); + assert(f); + assert(fds); + + if (unit_can_serialize(u)) { + r = UNIT_VTABLE(u)->serialize(u, f, fds); + if (r < 0) + return r; + } + + (void) serialize_dual_timestamp(f, "state-change-timestamp", &u->state_change_timestamp); + + (void) serialize_dual_timestamp(f, "inactive-exit-timestamp", &u->inactive_exit_timestamp); + (void) serialize_dual_timestamp(f, "active-enter-timestamp", &u->active_enter_timestamp); + (void) serialize_dual_timestamp(f, "active-exit-timestamp", &u->active_exit_timestamp); + (void) serialize_dual_timestamp(f, "inactive-enter-timestamp", &u->inactive_enter_timestamp); + + (void) serialize_dual_timestamp(f, "condition-timestamp", &u->condition_timestamp); + (void) serialize_dual_timestamp(f, "assert-timestamp", &u->assert_timestamp); + + if (dual_timestamp_is_set(&u->condition_timestamp)) + (void) serialize_bool(f, "condition-result", u->condition_result); + + if (dual_timestamp_is_set(&u->assert_timestamp)) + (void) serialize_bool(f, "assert-result", u->assert_result); + + (void) serialize_bool(f, "transient", u->transient); + (void) serialize_bool(f, "in-audit", u->in_audit); + + (void) serialize_bool(f, "exported-invocation-id", u->exported_invocation_id); + (void) serialize_bool(f, "exported-log-level-max", u->exported_log_level_max); + (void) serialize_bool(f, "exported-log-extra-fields", u->exported_log_extra_fields); + (void) serialize_bool(f, "exported-log-rate-limit-interval", u->exported_log_rate_limit_interval); + (void) serialize_bool(f, "exported-log-rate-limit-burst", u->exported_log_rate_limit_burst); + + (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, u->cpu_usage_base); + if (u->cpu_usage_last != NSEC_INFINITY) + (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last); + + if (u->cgroup_path) + (void) serialize_item(f, "cgroup", u->cgroup_path); + + (void) serialize_bool(f, "cgroup-realized", u->cgroup_realized); + (void) serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask); + (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask); + (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", u->cgroup_invalidated_mask); + + if (uid_is_valid(u->ref_uid)) + (void) serialize_item_format(f, "ref-uid", UID_FMT, u->ref_uid); + if (gid_is_valid(u->ref_gid)) + (void) serialize_item_format(f, "ref-gid", GID_FMT, u->ref_gid); + + if (!sd_id128_is_null(u->invocation_id)) + (void) serialize_item_format(f, "invocation-id", SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)); + + bus_track_serialize(u->bus_track, f, "ref"); + + for (m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { + uint64_t v; + + r = unit_get_ip_accounting(u, m, &v); + if (r >= 0) + (void) serialize_item_format(f, ip_accounting_metric_field[m], "%" PRIu64, v); + } + + if (serialize_jobs) { + if (u->job) { + fputs("job\n", f); + job_serialize(u->job, f); + } + + if (u->nop_job) { + fputs("job\n", f); + job_serialize(u->nop_job, f); + } + } + + /* End marker */ + fputc('\n', f); + return 0; +} + +static int unit_deserialize_job(Unit *u, FILE *f) { + _cleanup_(job_freep) Job *j = NULL; + int r; + + assert(u); + assert(f); + + j = job_new_raw(u); + if (!j) + return log_oom(); + + r = job_deserialize(j, f); + if (r < 0) + return r; + + r = job_install_deserialized(j); + if (r < 0) + return r; + + TAKE_PTR(j); + return 0; +} + +int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { + int r; + + assert(u); + assert(f); + assert(fds); + + for (;;) { + _cleanup_free_ char *line = NULL; + CGroupIPAccountingMetric m; + char *l, *v; + size_t k; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) /* eof */ + break; + + l = strstrip(line); + if (isempty(l)) /* End marker */ + break; + + k = strcspn(l, "="); + + if (l[k] == '=') { + l[k] = 0; + v = l+k+1; + } else + v = l+k; + + if (streq(l, "job")) { + if (v[0] == '\0') { + /* New-style serialized job */ + r = unit_deserialize_job(u, f); + if (r < 0) + return r; + } else /* Legacy for pre-44 */ + log_unit_warning(u, "Update from too old systemd versions are unsupported, cannot deserialize job: %s", v); + continue; + } else if (streq(l, "state-change-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->state_change_timestamp); + continue; + } else if (streq(l, "inactive-exit-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->inactive_exit_timestamp); + continue; + } else if (streq(l, "active-enter-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->active_enter_timestamp); + continue; + } else if (streq(l, "active-exit-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->active_exit_timestamp); + continue; + } else if (streq(l, "inactive-enter-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->inactive_enter_timestamp); + continue; + } else if (streq(l, "condition-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->condition_timestamp); + continue; + } else if (streq(l, "assert-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->assert_timestamp); + continue; + } else if (streq(l, "condition-result")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse condition result value %s, ignoring.", v); + else + u->condition_result = r; + + continue; + + } else if (streq(l, "assert-result")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse assert result value %s, ignoring.", v); + else + u->assert_result = r; + + continue; + + } else if (streq(l, "transient")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse transient bool %s, ignoring.", v); + else + u->transient = r; + + continue; + + } else if (streq(l, "in-audit")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse in-audit bool %s, ignoring.", v); + else + u->in_audit = r; + + continue; + + } else if (streq(l, "exported-invocation-id")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported invocation ID bool %s, ignoring.", v); + else + u->exported_invocation_id = r; + + continue; + + } else if (streq(l, "exported-log-level-max")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported log level max bool %s, ignoring.", v); + else + u->exported_log_level_max = r; + + continue; + + } else if (streq(l, "exported-log-extra-fields")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported log extra fields bool %s, ignoring.", v); + else + u->exported_log_extra_fields = r; + + continue; + + } else if (streq(l, "exported-log-rate-limit-interval")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported log rate limit interval %s, ignoring.", v); + else + u->exported_log_rate_limit_interval = r; + + continue; + + } else if (streq(l, "exported-log-rate-limit-burst")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported log rate limit burst %s, ignoring.", v); + else + u->exported_log_rate_limit_burst = r; + + continue; + + } else if (STR_IN_SET(l, "cpu-usage-base", "cpuacct-usage-base")) { + + r = safe_atou64(v, &u->cpu_usage_base); + if (r < 0) + log_unit_debug(u, "Failed to parse CPU usage base %s, ignoring.", v); + + continue; + + } else if (streq(l, "cpu-usage-last")) { + + r = safe_atou64(v, &u->cpu_usage_last); + if (r < 0) + log_unit_debug(u, "Failed to read CPU usage last %s, ignoring.", v); + + continue; + + } else if (streq(l, "cgroup")) { + + r = unit_set_cgroup_path(u, v); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v); + + (void) unit_watch_cgroup(u); + + continue; + } else if (streq(l, "cgroup-realized")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_unit_debug(u, "Failed to parse cgroup-realized bool %s, ignoring.", v); + else + u->cgroup_realized = b; + + continue; + + } else if (streq(l, "cgroup-realized-mask")) { + + r = cg_mask_from_string(v, &u->cgroup_realized_mask); + if (r < 0) + log_unit_debug(u, "Failed to parse cgroup-realized-mask %s, ignoring.", v); + continue; + + } else if (streq(l, "cgroup-enabled-mask")) { + + r = cg_mask_from_string(v, &u->cgroup_enabled_mask); + if (r < 0) + log_unit_debug(u, "Failed to parse cgroup-enabled-mask %s, ignoring.", v); + continue; + + } else if (streq(l, "cgroup-invalidated-mask")) { + + r = cg_mask_from_string(v, &u->cgroup_invalidated_mask); + if (r < 0) + log_unit_debug(u, "Failed to parse cgroup-invalidated-mask %s, ignoring.", v); + continue; + + } else if (streq(l, "ref-uid")) { + uid_t uid; + + r = parse_uid(v, &uid); + if (r < 0) + log_unit_debug(u, "Failed to parse referenced UID %s, ignoring.", v); + else + unit_ref_uid_gid(u, uid, GID_INVALID); + + continue; + + } else if (streq(l, "ref-gid")) { + gid_t gid; + + r = parse_gid(v, &gid); + if (r < 0) + log_unit_debug(u, "Failed to parse referenced GID %s, ignoring.", v); + else + unit_ref_uid_gid(u, UID_INVALID, gid); + + continue; + + } else if (streq(l, "ref")) { + + r = strv_extend(&u->deserialized_refs, v); + if (r < 0) + return log_oom(); + + continue; + } else if (streq(l, "invocation-id")) { + sd_id128_t id; + + r = sd_id128_from_string(v, &id); + if (r < 0) + log_unit_debug(u, "Failed to parse invocation id %s, ignoring.", v); + else { + r = unit_set_invocation_id(u, id); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to set invocation ID for unit: %m"); + } + + continue; + } + + /* Check if this is an IP accounting metric serialization field */ + for (m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) + if (streq(l, ip_accounting_metric_field[m])) + break; + if (m < _CGROUP_IP_ACCOUNTING_METRIC_MAX) { + uint64_t c; + + r = safe_atou64(v, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", v); + else + u->ip_accounting_extra[m] = c; + continue; + } + + if (unit_can_serialize(u)) { + r = exec_runtime_deserialize_compat(u, l, v, fds); + if (r < 0) { + log_unit_warning(u, "Failed to deserialize runtime parameter '%s', ignoring.", l); + continue; + } + + /* Returns positive if key was handled by the call */ + if (r > 0) + continue; + + r = UNIT_VTABLE(u)->deserialize_item(u, l, v, fds); + if (r < 0) + log_unit_warning(u, "Failed to deserialize unit parameter '%s', ignoring.", l); + } + } + + /* Versions before 228 did not carry a state change timestamp. In this case, take the current time. This is + * useful, so that timeouts based on this timestamp don't trigger too early, and is in-line with the logic from + * before 228 where the base for timeouts was not persistent across reboots. */ + + if (!dual_timestamp_is_set(&u->state_change_timestamp)) + dual_timestamp_get(&u->state_change_timestamp); + + /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings applied + * after we are done. For that we invalidate anything already realized, so that we can realize it again. */ + unit_invalidate_cgroup(u, _CGROUP_MASK_ALL); + unit_invalidate_cgroup_bpf(u); + + return 0; +} + +int unit_deserialize_skip(FILE *f) { + int r; + assert(f); + + /* Skip serialized data for this unit. We don't know what it is. */ + + for (;;) { + _cleanup_free_ char *line = NULL; + char *l; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + return 0; + + l = strstrip(line); + + /* End marker */ + if (isempty(l)) + return 1; + } +} + +int unit_add_node_dependency(Unit *u, const char *what, bool wants, UnitDependency dep, UnitDependencyMask mask) { + Unit *device; + _cleanup_free_ char *e = NULL; + int r; + + assert(u); + + /* Adds in links to the device node that this unit is based on */ + if (isempty(what)) + return 0; + + if (!is_device_path(what)) + return 0; + + /* When device units aren't supported (such as in a + * container), don't create dependencies on them. */ + if (!unit_type_supported(UNIT_DEVICE)) + return 0; + + r = unit_name_from_path(what, ".device", &e); + if (r < 0) + return r; + + r = manager_load_unit(u->manager, e, NULL, NULL, &device); + if (r < 0) + return r; + + if (dep == UNIT_REQUIRES && device_shall_be_bound_by(device, u)) + dep = UNIT_BINDS_TO; + + r = unit_add_two_dependencies(u, UNIT_AFTER, + MANAGER_IS_SYSTEM(u->manager) ? dep : UNIT_WANTS, + device, true, mask); + if (r < 0) + return r; + + if (wants) { + r = unit_add_dependency(device, UNIT_WANTS, u, false, mask); + if (r < 0) + return r; + } + + return 0; +} + +int unit_coldplug(Unit *u) { + int r = 0, q; + char **i; + + assert(u); + + /* Make sure we don't enter a loop, when coldplugging recursively. */ + if (u->coldplugged) + return 0; + + u->coldplugged = true; + + STRV_FOREACH(i, u->deserialized_refs) { + q = bus_unit_track_add_name(u, *i); + if (q < 0 && r >= 0) + r = q; + } + u->deserialized_refs = strv_free(u->deserialized_refs); + + if (UNIT_VTABLE(u)->coldplug) { + q = UNIT_VTABLE(u)->coldplug(u); + if (q < 0 && r >= 0) + r = q; + } + + if (u->job) { + q = job_coldplug(u->job); + if (q < 0 && r >= 0) + r = q; + } + + return r; +} + +void unit_catchup(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->catchup) + UNIT_VTABLE(u)->catchup(u); +} + +static bool fragment_mtime_newer(const char *path, usec_t mtime, bool path_masked) { + struct stat st; + + if (!path) + return false; + + /* If the source is some virtual kernel file system, then we assume we watch it anyway, and hence pretend we + * are never out-of-date. */ + if (PATH_STARTSWITH_SET(path, "/proc", "/sys")) + return false; + + if (stat(path, &st) < 0) + /* What, cannot access this anymore? */ + return true; + + if (path_masked) + /* For masked files check if they are still so */ + return !null_or_empty(&st); + else + /* For non-empty files check the mtime */ + return timespec_load(&st.st_mtim) > mtime; + + return false; +} + +bool unit_need_daemon_reload(Unit *u) { + _cleanup_strv_free_ char **t = NULL; + char **path; + + assert(u); + + /* For unit files, we allow masking… */ + if (fragment_mtime_newer(u->fragment_path, u->fragment_mtime, + u->load_state == UNIT_MASKED)) + return true; + + /* Source paths should not be masked… */ + if (fragment_mtime_newer(u->source_path, u->source_mtime, false)) + return true; + + if (u->load_state == UNIT_LOADED) + (void) unit_find_dropin_paths(u, &t); + if (!strv_equal(u->dropin_paths, t)) + return true; + + /* … any drop-ins that are masked are simply omitted from the list. */ + STRV_FOREACH(path, u->dropin_paths) + if (fragment_mtime_newer(*path, u->dropin_mtime, false)) + return true; + + return false; +} + +void unit_reset_failed(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->reset_failed) + UNIT_VTABLE(u)->reset_failed(u); + + RATELIMIT_RESET(u->start_limit); + u->start_limit_hit = false; +} + +Unit *unit_following(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->following) + return UNIT_VTABLE(u)->following(u); + + return NULL; +} + +bool unit_stop_pending(Unit *u) { + assert(u); + + /* This call does check the current state of the unit. It's + * hence useful to be called from state change calls of the + * unit itself, where the state isn't updated yet. This is + * different from unit_inactive_or_pending() which checks both + * the current state and for a queued job. */ + + return u->job && u->job->type == JOB_STOP; +} + +bool unit_inactive_or_pending(Unit *u) { + assert(u); + + /* Returns true if the unit is inactive or going down */ + + if (UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u))) + return true; + + if (unit_stop_pending(u)) + return true; + + return false; +} + +bool unit_active_or_pending(Unit *u) { + assert(u); + + /* Returns true if the unit is active or going up */ + + if (UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u))) + return true; + + if (u->job && + IN_SET(u->job->type, JOB_START, JOB_RELOAD_OR_START, JOB_RESTART)) + return true; + + return false; +} + +bool unit_will_restart(Unit *u) { + assert(u); + + if (!UNIT_VTABLE(u)->will_restart) + return false; + + return UNIT_VTABLE(u)->will_restart(u); +} + +int unit_kill(Unit *u, KillWho w, int signo, sd_bus_error *error) { + assert(u); + assert(w >= 0 && w < _KILL_WHO_MAX); + assert(SIGNAL_VALID(signo)); + + if (!UNIT_VTABLE(u)->kill) + return -EOPNOTSUPP; + + return UNIT_VTABLE(u)->kill(u, w, signo, error); +} + +static Set *unit_pid_set(pid_t main_pid, pid_t control_pid) { + _cleanup_set_free_ Set *pid_set = NULL; + int r; + + pid_set = set_new(NULL); + if (!pid_set) + return NULL; + + /* Exclude the main/control pids from being killed via the cgroup */ + if (main_pid > 0) { + r = set_put(pid_set, PID_TO_PTR(main_pid)); + if (r < 0) + return NULL; + } + + if (control_pid > 0) { + r = set_put(pid_set, PID_TO_PTR(control_pid)); + if (r < 0) + return NULL; + } + + return TAKE_PTR(pid_set); +} + +int unit_kill_common( + Unit *u, + KillWho who, + int signo, + pid_t main_pid, + pid_t control_pid, + sd_bus_error *error) { + + int r = 0; + bool killed = false; + + if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL)) { + if (main_pid < 0) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type)); + else if (main_pid == 0) + return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill"); + } + + if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL)) { + if (control_pid < 0) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type)); + else if (control_pid == 0) + return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill"); + } + + if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL)) + if (control_pid > 0) { + if (kill(control_pid, signo) < 0) + r = -errno; + else + killed = true; + } + + if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL)) + if (main_pid > 0) { + if (kill(main_pid, signo) < 0) + r = -errno; + else + killed = true; + } + + if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && u->cgroup_path) { + _cleanup_set_free_ Set *pid_set = NULL; + int q; + + /* Exclude the main/control pids from being killed via the cgroup */ + pid_set = unit_pid_set(main_pid, control_pid); + if (!pid_set) + return -ENOMEM; + + q = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, signo, 0, pid_set, NULL, NULL); + if (q < 0 && !IN_SET(q, -EAGAIN, -ESRCH, -ENOENT)) + r = q; + else + killed = true; + } + + if (r == 0 && !killed && IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL)) + return -ESRCH; + + return r; +} + +int unit_following_set(Unit *u, Set **s) { + assert(u); + assert(s); + + if (UNIT_VTABLE(u)->following_set) + return UNIT_VTABLE(u)->following_set(u, s); + + *s = NULL; + return 0; +} + +UnitFileState unit_get_unit_file_state(Unit *u) { + int r; + + assert(u); + + if (u->unit_file_state < 0 && u->fragment_path) { + r = unit_file_get_state( + u->manager->unit_file_scope, + NULL, + u->id, + &u->unit_file_state); + if (r < 0) + u->unit_file_state = UNIT_FILE_BAD; + } + + return u->unit_file_state; +} + +int unit_get_unit_file_preset(Unit *u) { + assert(u); + + if (u->unit_file_preset < 0 && u->fragment_path) + u->unit_file_preset = unit_file_query_preset( + u->manager->unit_file_scope, + NULL, + basename(u->fragment_path)); + + return u->unit_file_preset; +} + +Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target) { + assert(ref); + assert(source); + assert(target); + + if (ref->target) + unit_ref_unset(ref); + + ref->source = source; + ref->target = target; + LIST_PREPEND(refs_by_target, target->refs_by_target, ref); + return target; +} + +void unit_ref_unset(UnitRef *ref) { + assert(ref); + + if (!ref->target) + return; + + /* We are about to drop a reference to the unit, make sure the garbage collection has a look at it as it might + * be unreferenced now. */ + unit_add_to_gc_queue(ref->target); + + LIST_REMOVE(refs_by_target, ref->target->refs_by_target, ref); + ref->source = ref->target = NULL; +} + +static int user_from_unit_name(Unit *u, char **ret) { + + static const uint8_t hash_key[] = { + 0x58, 0x1a, 0xaf, 0xe6, 0x28, 0x58, 0x4e, 0x96, + 0xb4, 0x4e, 0xf5, 0x3b, 0x8c, 0x92, 0x07, 0xec + }; + + _cleanup_free_ char *n = NULL; + int r; + + r = unit_name_to_prefix(u->id, &n); + if (r < 0) + return r; + + if (valid_user_group_name(n)) { + *ret = TAKE_PTR(n); + return 0; + } + + /* If we can't use the unit name as a user name, then let's hash it and use that */ + if (asprintf(ret, "_du%016" PRIx64, siphash24(n, strlen(n), hash_key)) < 0) + return -ENOMEM; + + return 0; +} + +int unit_patch_contexts(Unit *u) { + CGroupContext *cc; + ExecContext *ec; + unsigned i; + int r; + + assert(u); + + /* Patch in the manager defaults into the exec and cgroup + * contexts, _after_ the rest of the settings have been + * initialized */ + + ec = unit_get_exec_context(u); + if (ec) { + /* This only copies in the ones that need memory */ + for (i = 0; i < _RLIMIT_MAX; i++) + if (u->manager->rlimit[i] && !ec->rlimit[i]) { + ec->rlimit[i] = newdup(struct rlimit, u->manager->rlimit[i], 1); + if (!ec->rlimit[i]) + return -ENOMEM; + } + + if (MANAGER_IS_USER(u->manager) && + !ec->working_directory) { + + r = get_home_dir(&ec->working_directory); + if (r < 0) + return r; + + /* Allow user services to run, even if the + * home directory is missing */ + ec->working_directory_missing_ok = true; + } + + if (ec->private_devices) + ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO)); + + if (ec->protect_kernel_modules) + ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); + + if (ec->dynamic_user) { + if (!ec->user) { + r = user_from_unit_name(u, &ec->user); + if (r < 0) + return r; + } + + if (!ec->group) { + ec->group = strdup(ec->user); + if (!ec->group) + return -ENOMEM; + } + + /* If the dynamic user option is on, let's make sure that the unit can't leave its UID/GID + * around in the file system or on IPC objects. Hence enforce a strict sandbox. */ + + ec->private_tmp = true; + ec->remove_ipc = true; + ec->protect_system = PROTECT_SYSTEM_STRICT; + if (ec->protect_home == PROTECT_HOME_NO) + ec->protect_home = PROTECT_HOME_READ_ONLY; + } + } + + cc = unit_get_cgroup_context(u); + if (cc && ec) { + + if (ec->private_devices && + cc->device_policy == CGROUP_AUTO) + cc->device_policy = CGROUP_CLOSED; + + if (ec->root_image && + (cc->device_policy != CGROUP_AUTO || cc->device_allow)) { + + /* When RootImage= is specified, the following devices are touched. */ + r = cgroup_add_device_allow(cc, "/dev/loop-control", "rw"); + if (r < 0) + return r; + + r = cgroup_add_device_allow(cc, "block-loop", "rwm"); + if (r < 0) + return r; + + r = cgroup_add_device_allow(cc, "block-blkext", "rwm"); + if (r < 0) + return r; + } + } + + return 0; +} + +ExecContext *unit_get_exec_context(Unit *u) { + size_t offset; + assert(u); + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->exec_context_offset; + if (offset <= 0) + return NULL; + + return (ExecContext*) ((uint8_t*) u + offset); +} + +KillContext *unit_get_kill_context(Unit *u) { + size_t offset; + assert(u); + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->kill_context_offset; + if (offset <= 0) + return NULL; + + return (KillContext*) ((uint8_t*) u + offset); +} + +CGroupContext *unit_get_cgroup_context(Unit *u) { + size_t offset; + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->cgroup_context_offset; + if (offset <= 0) + return NULL; + + return (CGroupContext*) ((uint8_t*) u + offset); +} + +ExecRuntime *unit_get_exec_runtime(Unit *u) { + size_t offset; + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->exec_runtime_offset; + if (offset <= 0) + return NULL; + + return *(ExecRuntime**) ((uint8_t*) u + offset); +} + +static const char* unit_drop_in_dir(Unit *u, UnitWriteFlags flags) { + assert(u); + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + return NULL; + + if (u->transient) /* Redirect drop-ins for transient units always into the transient directory. */ + return u->manager->lookup_paths.transient; + + if (flags & UNIT_PERSISTENT) + return u->manager->lookup_paths.persistent_control; + + if (flags & UNIT_RUNTIME) + return u->manager->lookup_paths.runtime_control; + + return NULL; +} + +char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf) { + char *ret = NULL; + + if (!s) + return NULL; + + /* Escapes the input string as requested. Returns the escaped string. If 'buf' is specified then the allocated + * return buffer pointer is also written to *buf, except if no escaping was necessary, in which case *buf is + * set to NULL, and the input pointer is returned as-is. This means the return value always contains a properly + * escaped version, but *buf when passed only contains a pointer if an allocation was necessary. If *buf is + * not specified, then the return value always needs to be freed. Callers can use this to optimize memory + * allocations. */ + + if (flags & UNIT_ESCAPE_SPECIFIERS) { + ret = specifier_escape(s); + if (!ret) + return NULL; + + s = ret; + } + + if (flags & UNIT_ESCAPE_C) { + char *a; + + a = cescape(s); + free(ret); + if (!a) + return NULL; + + ret = a; + } + + if (buf) { + *buf = ret; + return ret ?: (char*) s; + } + + return ret ?: strdup(s); +} + +char* unit_concat_strv(char **l, UnitWriteFlags flags) { + _cleanup_free_ char *result = NULL; + size_t n = 0, allocated = 0; + char **i; + + /* Takes a list of strings, escapes them, and concatenates them. This may be used to format command lines in a + * way suitable for ExecStart= stanzas */ + + STRV_FOREACH(i, l) { + _cleanup_free_ char *buf = NULL; + const char *p; + size_t a; + char *q; + + p = unit_escape_setting(*i, flags, &buf); + if (!p) + return NULL; + + a = (n > 0) + 1 + strlen(p) + 1; /* separating space + " + entry + " */ + if (!GREEDY_REALLOC(result, allocated, n + a + 1)) + return NULL; + + q = result + n; + if (n > 0) + *(q++) = ' '; + + *(q++) = '"'; + q = stpcpy(q, p); + *(q++) = '"'; + + n += a; + } + + if (!GREEDY_REALLOC(result, allocated, n + 1)) + return NULL; + + result[n] = 0; + + return TAKE_PTR(result); +} + +int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data) { + _cleanup_free_ char *p = NULL, *q = NULL, *escaped = NULL; + const char *dir, *wrapped; + int r; + + assert(u); + assert(name); + assert(data); + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + return 0; + + data = unit_escape_setting(data, flags, &escaped); + if (!data) + return -ENOMEM; + + /* Prefix the section header. If we are writing this out as transient file, then let's suppress this if the + * previous section header is the same */ + + if (flags & UNIT_PRIVATE) { + if (!UNIT_VTABLE(u)->private_section) + return -EINVAL; + + if (!u->transient_file || u->last_section_private < 0) + data = strjoina("[", UNIT_VTABLE(u)->private_section, "]\n", data); + else if (u->last_section_private == 0) + data = strjoina("\n[", UNIT_VTABLE(u)->private_section, "]\n", data); + } else { + if (!u->transient_file || u->last_section_private < 0) + data = strjoina("[Unit]\n", data); + else if (u->last_section_private > 0) + data = strjoina("\n[Unit]\n", data); + } + + if (u->transient_file) { + /* When this is a transient unit file in creation, then let's not create a new drop-in but instead + * write to the transient unit file. */ + fputs(data, u->transient_file); + + if (!endswith(data, "\n")) + fputc('\n', u->transient_file); + + /* Remember which section we wrote this entry to */ + u->last_section_private = !!(flags & UNIT_PRIVATE); + return 0; + } + + dir = unit_drop_in_dir(u, flags); + if (!dir) + return -EINVAL; + + wrapped = strjoina("# This is a drop-in unit file extension, created via \"systemctl set-property\"\n" + "# or an equivalent operation. Do not edit.\n", + data, + "\n"); + + r = drop_in_file(dir, u->id, 50, name, &p, &q); + if (r < 0) + return r; + + (void) mkdir_p_label(p, 0755); + r = write_string_file_atomic_label(q, wrapped); + if (r < 0) + return r; + + r = strv_push(&u->dropin_paths, q); + if (r < 0) + return r; + q = NULL; + + strv_uniq(u->dropin_paths); + + u->dropin_mtime = now(CLOCK_REALTIME); + + return 0; +} + +int unit_write_settingf(Unit *u, UnitWriteFlags flags, const char *name, const char *format, ...) { + _cleanup_free_ char *p = NULL; + va_list ap; + int r; + + assert(u); + assert(name); + assert(format); + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + return 0; + + va_start(ap, format); + r = vasprintf(&p, format, ap); + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return unit_write_setting(u, flags, name, p); +} + +int unit_make_transient(Unit *u) { + _cleanup_free_ char *path = NULL; + FILE *f; + + assert(u); + + if (!UNIT_VTABLE(u)->can_transient) + return -EOPNOTSUPP; + + (void) mkdir_p_label(u->manager->lookup_paths.transient, 0755); + + path = strjoin(u->manager->lookup_paths.transient, "/", u->id); + if (!path) + return -ENOMEM; + + /* Let's open the file we'll write the transient settings into. This file is kept open as long as we are + * creating the transient, and is closed in unit_load(), as soon as we start loading the file. */ + + RUN_WITH_UMASK(0022) { + f = fopen(path, "we"); + if (!f) + return -errno; + } + + safe_fclose(u->transient_file); + u->transient_file = f; + + free_and_replace(u->fragment_path, path); + + u->source_path = mfree(u->source_path); + u->dropin_paths = strv_free(u->dropin_paths); + u->fragment_mtime = u->source_mtime = u->dropin_mtime = 0; + + u->load_state = UNIT_STUB; + u->load_error = 0; + u->transient = true; + + unit_add_to_dbus_queue(u); + unit_add_to_gc_queue(u); + + fputs("# This is a transient unit file, created programmatically via the systemd API. Do not edit.\n", + u->transient_file); + + return 0; +} + +static void log_kill(pid_t pid, int sig, void *userdata) { + _cleanup_free_ char *comm = NULL; + + (void) get_process_comm(pid, &comm); + + /* Don't log about processes marked with brackets, under the assumption that these are temporary processes + only, like for example systemd's own PAM stub process. */ + if (comm && comm[0] == '(') + return; + + log_unit_notice(userdata, + "Killing process " PID_FMT " (%s) with signal SIG%s.", + pid, + strna(comm), + signal_to_string(sig)); +} + +static int operation_to_signal(KillContext *c, KillOperation k) { + assert(c); + + switch (k) { + + case KILL_TERMINATE: + case KILL_TERMINATE_AND_LOG: + return c->kill_signal; + + case KILL_KILL: + return c->final_kill_signal; + + case KILL_WATCHDOG: + return c->watchdog_signal; + + default: + assert_not_reached("KillOperation unknown"); + } +} + +int unit_kill_context( + Unit *u, + KillContext *c, + KillOperation k, + pid_t main_pid, + pid_t control_pid, + bool main_pid_alien) { + + bool wait_for_exit = false, send_sighup; + cg_kill_log_func_t log_func = NULL; + int sig, r; + + assert(u); + assert(c); + + /* Kill the processes belonging to this unit, in preparation for shutting the unit down. + * Returns > 0 if we killed something worth waiting for, 0 otherwise. */ + + if (c->kill_mode == KILL_NONE) + return 0; + + sig = operation_to_signal(c, k); + + send_sighup = + c->send_sighup && + IN_SET(k, KILL_TERMINATE, KILL_TERMINATE_AND_LOG) && + sig != SIGHUP; + + if (k != KILL_TERMINATE || IN_SET(sig, SIGKILL, SIGABRT)) + log_func = log_kill; + + if (main_pid > 0) { + if (log_func) + log_func(main_pid, sig, u); + + r = kill_and_sigcont(main_pid, sig); + if (r < 0 && r != -ESRCH) { + _cleanup_free_ char *comm = NULL; + (void) get_process_comm(main_pid, &comm); + + log_unit_warning_errno(u, r, "Failed to kill main process " PID_FMT " (%s), ignoring: %m", main_pid, strna(comm)); + } else { + if (!main_pid_alien) + wait_for_exit = true; + + if (r != -ESRCH && send_sighup) + (void) kill(main_pid, SIGHUP); + } + } + + if (control_pid > 0) { + if (log_func) + log_func(control_pid, sig, u); + + r = kill_and_sigcont(control_pid, sig); + if (r < 0 && r != -ESRCH) { + _cleanup_free_ char *comm = NULL; + (void) get_process_comm(control_pid, &comm); + + log_unit_warning_errno(u, r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", control_pid, strna(comm)); + } else { + wait_for_exit = true; + + if (r != -ESRCH && send_sighup) + (void) kill(control_pid, SIGHUP); + } + } + + if (u->cgroup_path && + (c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL))) { + _cleanup_set_free_ Set *pid_set = NULL; + + /* Exclude the main/control pids from being killed via the cgroup */ + pid_set = unit_pid_set(main_pid, control_pid); + if (!pid_set) + return -ENOMEM; + + r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, + sig, + CGROUP_SIGCONT|CGROUP_IGNORE_SELF, + pid_set, + log_func, u); + if (r < 0) { + if (!IN_SET(r, -EAGAIN, -ESRCH, -ENOENT)) + log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", u->cgroup_path); + + } else if (r > 0) { + + /* FIXME: For now, on the legacy hierarchy, we will not wait for the cgroup members to die if + * we are running in a container or if this is a delegation unit, simply because cgroup + * notification is unreliable in these cases. It doesn't work at all in containers, and outside + * of containers it can be confused easily by left-over directories in the cgroup — which + * however should not exist in non-delegated units. On the unified hierarchy that's different, + * there we get proper events. Hence rely on them. */ + + if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0 || + (detect_container() == 0 && !unit_cgroup_delegate(u))) + wait_for_exit = true; + + if (send_sighup) { + set_free(pid_set); + + pid_set = unit_pid_set(main_pid, control_pid); + if (!pid_set) + return -ENOMEM; + + cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, + SIGHUP, + CGROUP_IGNORE_SELF, + pid_set, + NULL, NULL); + } + } + } + + return wait_for_exit; +} + +int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) { + _cleanup_free_ char *p = NULL; + UnitDependencyInfo di; + int r; + + assert(u); + assert(path); + + /* Registers a unit for requiring a certain path and all its prefixes. We keep a hashtable of these paths in + * the unit (from the path to the UnitDependencyInfo structure indicating how to the dependency came to + * be). However, we build a prefix table for all possible prefixes so that new appearing mount units can easily + * determine which units to make themselves a dependency of. */ + + if (!path_is_absolute(path)) + return -EINVAL; + + r = hashmap_ensure_allocated(&u->requires_mounts_for, &path_hash_ops); + if (r < 0) + return r; + + p = strdup(path); + if (!p) + return -ENOMEM; + + path = path_simplify(p, false); + + if (!path_is_normalized(path)) + return -EPERM; + + if (hashmap_contains(u->requires_mounts_for, path)) + return 0; + + di = (UnitDependencyInfo) { + .origin_mask = mask + }; + + r = hashmap_put(u->requires_mounts_for, path, di.data); + if (r < 0) + return r; + p = NULL; + + char prefix[strlen(path) + 1]; + PATH_FOREACH_PREFIX_MORE(prefix, path) { + Set *x; + + x = hashmap_get(u->manager->units_requiring_mounts_for, prefix); + if (!x) { + _cleanup_free_ char *q = NULL; + + r = hashmap_ensure_allocated(&u->manager->units_requiring_mounts_for, &path_hash_ops); + if (r < 0) + return r; + + q = strdup(prefix); + if (!q) + return -ENOMEM; + + x = set_new(NULL); + if (!x) + return -ENOMEM; + + r = hashmap_put(u->manager->units_requiring_mounts_for, q, x); + if (r < 0) { + set_free(x); + return r; + } + q = NULL; + } + + r = set_put(x, u); + if (r < 0) + return r; + } + + return 0; +} + +int unit_setup_exec_runtime(Unit *u) { + ExecRuntime **rt; + size_t offset; + Unit *other; + Iterator i; + void *v; + int r; + + offset = UNIT_VTABLE(u)->exec_runtime_offset; + assert(offset > 0); + + /* Check if there already is an ExecRuntime for this unit? */ + rt = (ExecRuntime**) ((uint8_t*) u + offset); + if (*rt) + return 0; + + /* Try to get it from somebody else */ + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_JOINS_NAMESPACE_OF], i) { + r = exec_runtime_acquire(u->manager, NULL, other->id, false, rt); + if (r == 1) + return 1; + } + + return exec_runtime_acquire(u->manager, unit_get_exec_context(u), u->id, true, rt); +} + +int unit_setup_dynamic_creds(Unit *u) { + ExecContext *ec; + DynamicCreds *dcreds; + size_t offset; + + assert(u); + + offset = UNIT_VTABLE(u)->dynamic_creds_offset; + assert(offset > 0); + dcreds = (DynamicCreds*) ((uint8_t*) u + offset); + + ec = unit_get_exec_context(u); + assert(ec); + + if (!ec->dynamic_user) + return 0; + + return dynamic_creds_acquire(dcreds, u->manager, ec->user, ec->group); +} + +bool unit_type_supported(UnitType t) { + if (_unlikely_(t < 0)) + return false; + if (_unlikely_(t >= _UNIT_TYPE_MAX)) + return false; + + if (!unit_vtable[t]->supported) + return true; + + return unit_vtable[t]->supported(); +} + +void unit_warn_if_dir_nonempty(Unit *u, const char* where) { + int r; + + assert(u); + assert(where); + + r = dir_is_empty(where); + if (r > 0 || r == -ENOTDIR) + return; + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to check directory %s: %m", where); + return; + } + + log_struct(LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Directory %s to mount over is not empty, mounting anyway.", where), + "WHERE=%s", where); +} + +int unit_fail_if_noncanonical(Unit *u, const char* where) { + _cleanup_free_ char *canonical_where = NULL; + int r; + + assert(u); + assert(where); + + r = chase_symlinks(where, NULL, CHASE_NONEXISTENT, &canonical_where); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to check %s for symlinks, ignoring: %m", where); + return 0; + } + + /* We will happily ignore a trailing slash (or any redundant slashes) */ + if (path_equal(where, canonical_where)) + return 0; + + /* No need to mention "." or "..", they would already have been rejected by unit_name_from_path() */ + log_struct(LOG_ERR, + "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Mount path %s is not canonical (contains a symlink).", where), + "WHERE=%s", where); + + return -ELOOP; +} + +bool unit_is_pristine(Unit *u) { + assert(u); + + /* Check if the unit already exists or is already around, + * in a number of different ways. Note that to cater for unit + * types such as slice, we are generally fine with units that + * are marked UNIT_LOADED even though nothing was actually + * loaded, as those unit types don't require a file on disk. */ + + return !(!IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_LOADED) || + u->fragment_path || + u->source_path || + !strv_isempty(u->dropin_paths) || + u->job || + u->merged_into); +} + +pid_t unit_control_pid(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->control_pid) + return UNIT_VTABLE(u)->control_pid(u); + + return 0; +} + +pid_t unit_main_pid(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->main_pid) + return UNIT_VTABLE(u)->main_pid(u); + + return 0; +} + +static void unit_unref_uid_internal( + Unit *u, + uid_t *ref_uid, + bool destroy_now, + void (*_manager_unref_uid)(Manager *m, uid_t uid, bool destroy_now)) { + + assert(u); + assert(ref_uid); + assert(_manager_unref_uid); + + /* Generic implementation of both unit_unref_uid() and unit_unref_gid(), under the assumption that uid_t and + * gid_t are actually the same time, with the same validity rules. + * + * Drops a reference to UID/GID from a unit. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (!uid_is_valid(*ref_uid)) + return; + + _manager_unref_uid(u->manager, *ref_uid, destroy_now); + *ref_uid = UID_INVALID; +} + +void unit_unref_uid(Unit *u, bool destroy_now) { + unit_unref_uid_internal(u, &u->ref_uid, destroy_now, manager_unref_uid); +} + +void unit_unref_gid(Unit *u, bool destroy_now) { + unit_unref_uid_internal(u, (uid_t*) &u->ref_gid, destroy_now, manager_unref_gid); +} + +static int unit_ref_uid_internal( + Unit *u, + uid_t *ref_uid, + uid_t uid, + bool clean_ipc, + int (*_manager_ref_uid)(Manager *m, uid_t uid, bool clean_ipc)) { + + int r; + + assert(u); + assert(ref_uid); + assert(uid_is_valid(uid)); + assert(_manager_ref_uid); + + /* Generic implementation of both unit_ref_uid() and unit_ref_guid(), under the assumption that uid_t and gid_t + * are actually the same type, and have the same validity rules. + * + * Adds a reference on a specific UID/GID to this unit. Each unit referencing the same UID/GID maintains a + * reference so that we can destroy the UID/GID's IPC resources as soon as this is requested and the counter + * drops to zero. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (*ref_uid == uid) + return 0; + + if (uid_is_valid(*ref_uid)) /* Already set? */ + return -EBUSY; + + r = _manager_ref_uid(u->manager, uid, clean_ipc); + if (r < 0) + return r; + + *ref_uid = uid; + return 1; +} + +int unit_ref_uid(Unit *u, uid_t uid, bool clean_ipc) { + return unit_ref_uid_internal(u, &u->ref_uid, uid, clean_ipc, manager_ref_uid); +} + +int unit_ref_gid(Unit *u, gid_t gid, bool clean_ipc) { + return unit_ref_uid_internal(u, (uid_t*) &u->ref_gid, (uid_t) gid, clean_ipc, manager_ref_gid); +} + +static int unit_ref_uid_gid_internal(Unit *u, uid_t uid, gid_t gid, bool clean_ipc) { + int r = 0, q = 0; + + assert(u); + + /* Reference both a UID and a GID in one go. Either references both, or neither. */ + + if (uid_is_valid(uid)) { + r = unit_ref_uid(u, uid, clean_ipc); + if (r < 0) + return r; + } + + if (gid_is_valid(gid)) { + q = unit_ref_gid(u, gid, clean_ipc); + if (q < 0) { + if (r > 0) + unit_unref_uid(u, false); + + return q; + } + } + + return r > 0 || q > 0; +} + +int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid) { + ExecContext *c; + int r; + + assert(u); + + c = unit_get_exec_context(u); + + r = unit_ref_uid_gid_internal(u, uid, gid, c ? c->remove_ipc : false); + if (r < 0) + return log_unit_warning_errno(u, r, "Couldn't add UID/GID reference to unit, proceeding without: %m"); + + return r; +} + +void unit_unref_uid_gid(Unit *u, bool destroy_now) { + assert(u); + + unit_unref_uid(u, destroy_now); + unit_unref_gid(u, destroy_now); +} + +void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid) { + int r; + + assert(u); + + /* This is invoked whenever one of the forked off processes let's us know the UID/GID its user name/group names + * resolved to. We keep track of which UID/GID is currently assigned in order to be able to destroy its IPC + * objects when no service references the UID/GID anymore. */ + + r = unit_ref_uid_gid(u, uid, gid); + if (r > 0) + unit_add_to_dbus_queue(u); +} + +int unit_set_invocation_id(Unit *u, sd_id128_t id) { + int r; + + assert(u); + + /* Set the invocation ID for this unit. If we cannot, this will not roll back, but reset the whole thing. */ + + if (sd_id128_equal(u->invocation_id, id)) + return 0; + + if (!sd_id128_is_null(u->invocation_id)) + (void) hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u); + + if (sd_id128_is_null(id)) { + r = 0; + goto reset; + } + + r = hashmap_ensure_allocated(&u->manager->units_by_invocation_id, &id128_hash_ops); + if (r < 0) + goto reset; + + u->invocation_id = id; + sd_id128_to_string(id, u->invocation_id_string); + + r = hashmap_put(u->manager->units_by_invocation_id, &u->invocation_id, u); + if (r < 0) + goto reset; + + return 0; + +reset: + u->invocation_id = SD_ID128_NULL; + u->invocation_id_string[0] = 0; + return r; +} + +int unit_acquire_invocation_id(Unit *u) { + sd_id128_t id; + int r; + + assert(u); + + r = sd_id128_randomize(&id); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to generate invocation ID for unit: %m"); + + r = unit_set_invocation_id(u, id); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set invocation ID for unit: %m"); + + unit_add_to_dbus_queue(u); + return 0; +} + +int unit_set_exec_params(Unit *u, ExecParameters *p) { + int r; + + assert(u); + assert(p); + + /* Copy parameters from manager */ + r = manager_get_effective_environment(u->manager, &p->environment); + if (r < 0) + return r; + + p->confirm_spawn = manager_get_confirm_spawn(u->manager); + p->cgroup_supported = u->manager->cgroup_supported; + p->prefix = u->manager->prefix; + SET_FLAG(p->flags, EXEC_PASS_LOG_UNIT|EXEC_CHOWN_DIRECTORIES, MANAGER_IS_SYSTEM(u->manager)); + + /* Copy paramaters from unit */ + p->cgroup_path = u->cgroup_path; + SET_FLAG(p->flags, EXEC_CGROUP_DELEGATE, unit_cgroup_delegate(u)); + + return 0; +} + +int unit_fork_helper_process(Unit *u, const char *name, pid_t *ret) { + int r; + + assert(u); + assert(ret); + + /* Forks off a helper process and makes sure it is a member of the unit's cgroup. Returns == 0 in the child, + * and > 0 in the parent. The pid parameter is always filled in with the child's PID. */ + + (void) unit_realize_cgroup(u); + + r = safe_fork(name, FORK_REOPEN_LOG, ret); + if (r != 0) + return r; + + (void) default_signals(SIGNALS_CRASH_HANDLER, SIGNALS_IGNORE, -1); + (void) ignore_signals(SIGPIPE, -1); + + (void) prctl(PR_SET_PDEATHSIG, SIGTERM); + + if (u->cgroup_path) { + r = cg_attach_everywhere(u->manager->cgroup_supported, u->cgroup_path, 0, NULL, NULL); + if (r < 0) { + log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", u->cgroup_path); + _exit(EXIT_CGROUP); + } + } + + return 0; +} + +static void unit_update_dependency_mask(Unit *u, UnitDependency d, Unit *other, UnitDependencyInfo di) { + assert(u); + assert(d >= 0); + assert(d < _UNIT_DEPENDENCY_MAX); + assert(other); + + if (di.origin_mask == 0 && di.destination_mask == 0) { + /* No bit set anymore, let's drop the whole entry */ + assert_se(hashmap_remove(u->dependencies[d], other)); + log_unit_debug(u, "%s lost dependency %s=%s", u->id, unit_dependency_to_string(d), other->id); + } else + /* Mask was reduced, let's update the entry */ + assert_se(hashmap_update(u->dependencies[d], other, di.data) == 0); +} + +void unit_remove_dependencies(Unit *u, UnitDependencyMask mask) { + UnitDependency d; + + assert(u); + + /* Removes all dependencies u has on other units marked for ownership by 'mask'. */ + + if (mask == 0) + return; + + for (d = 0; d < _UNIT_DEPENDENCY_MAX; d++) { + bool done; + + do { + UnitDependencyInfo di; + Unit *other; + Iterator i; + + done = true; + + HASHMAP_FOREACH_KEY(di.data, other, u->dependencies[d], i) { + UnitDependency q; + + if ((di.origin_mask & ~mask) == di.origin_mask) + continue; + di.origin_mask &= ~mask; + unit_update_dependency_mask(u, d, other, di); + + /* We updated the dependency from our unit to the other unit now. But most dependencies + * imply a reverse dependency. Hence, let's delete that one too. For that we go through + * all dependency types on the other unit and delete all those which point to us and + * have the right mask set. */ + + for (q = 0; q < _UNIT_DEPENDENCY_MAX; q++) { + UnitDependencyInfo dj; + + dj.data = hashmap_get(other->dependencies[q], u); + if ((dj.destination_mask & ~mask) == dj.destination_mask) + continue; + dj.destination_mask &= ~mask; + + unit_update_dependency_mask(other, q, u, dj); + } + + unit_add_to_gc_queue(other); + + done = false; + break; + } + + } while (!done); + } +} + +static int unit_export_invocation_id(Unit *u) { + const char *p; + int r; + + assert(u); + + if (u->exported_invocation_id) + return 0; + + if (sd_id128_is_null(u->invocation_id)) + return 0; + + p = strjoina("/run/systemd/units/invocation:", u->id); + r = symlink_atomic(u->invocation_id_string, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create invocation ID symlink %s: %m", p); + + u->exported_invocation_id = true; + return 0; +} + +static int unit_export_log_level_max(Unit *u, const ExecContext *c) { + const char *p; + char buf[2]; + int r; + + assert(u); + assert(c); + + if (u->exported_log_level_max) + return 0; + + if (c->log_level_max < 0) + return 0; + + assert(c->log_level_max <= 7); + + buf[0] = '0' + c->log_level_max; + buf[1] = 0; + + p = strjoina("/run/systemd/units/log-level-max:", u->id); + r = symlink_atomic(buf, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create maximum log level symlink %s: %m", p); + + u->exported_log_level_max = true; + return 0; +} + +static int unit_export_log_extra_fields(Unit *u, const ExecContext *c) { + _cleanup_close_ int fd = -1; + struct iovec *iovec; + const char *p; + char *pattern; + le64_t *sizes; + ssize_t n; + size_t i; + int r; + + if (u->exported_log_extra_fields) + return 0; + + if (c->n_log_extra_fields <= 0) + return 0; + + sizes = newa(le64_t, c->n_log_extra_fields); + iovec = newa(struct iovec, c->n_log_extra_fields * 2); + + for (i = 0; i < c->n_log_extra_fields; i++) { + sizes[i] = htole64(c->log_extra_fields[i].iov_len); + + iovec[i*2] = IOVEC_MAKE(sizes + i, sizeof(le64_t)); + iovec[i*2+1] = c->log_extra_fields[i]; + } + + p = strjoina("/run/systemd/units/log-extra-fields:", u->id); + pattern = strjoina(p, ".XXXXXX"); + + fd = mkostemp_safe(pattern); + if (fd < 0) + return log_unit_debug_errno(u, fd, "Failed to create extra fields file %s: %m", p); + + n = writev(fd, iovec, c->n_log_extra_fields*2); + if (n < 0) { + r = log_unit_debug_errno(u, errno, "Failed to write extra fields: %m"); + goto fail; + } + + (void) fchmod(fd, 0644); + + if (rename(pattern, p) < 0) { + r = log_unit_debug_errno(u, errno, "Failed to rename extra fields file: %m"); + goto fail; + } + + u->exported_log_extra_fields = true; + return 0; + +fail: + (void) unlink(pattern); + return r; +} + +static int unit_export_log_rate_limit_interval(Unit *u, const ExecContext *c) { + _cleanup_free_ char *buf = NULL; + const char *p; + int r; + + assert(u); + assert(c); + + if (u->exported_log_rate_limit_interval) + return 0; + + if (c->log_rate_limit_interval_usec == 0) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id); + + if (asprintf(&buf, "%" PRIu64, c->log_rate_limit_interval_usec) < 0) + return log_oom(); + + r = symlink_atomic(buf, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create log rate limit interval symlink %s: %m", p); + + u->exported_log_rate_limit_interval = true; + return 0; +} + +static int unit_export_log_rate_limit_burst(Unit *u, const ExecContext *c) { + _cleanup_free_ char *buf = NULL; + const char *p; + int r; + + assert(u); + assert(c); + + if (u->exported_log_rate_limit_burst) + return 0; + + if (c->log_rate_limit_burst == 0) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id); + + if (asprintf(&buf, "%u", c->log_rate_limit_burst) < 0) + return log_oom(); + + r = symlink_atomic(buf, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create log rate limit burst symlink %s: %m", p); + + u->exported_log_rate_limit_burst = true; + return 0; +} + +void unit_export_state_files(Unit *u) { + const ExecContext *c; + + assert(u); + + if (!u->id) + return; + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + if (MANAGER_IS_TEST_RUN(u->manager)) + return; + + /* Exports a couple of unit properties to /run/systemd/units/, so that journald can quickly query this data + * from there. Ideally, journald would use IPC to query this, like everybody else, but that's hard, as long as + * the IPC system itself and PID 1 also log to the journal. + * + * Note that these files really shouldn't be considered API for anyone else, as use a runtime file system as + * IPC replacement is not compatible with today's world of file system namespaces. However, this doesn't really + * apply to communication between the journal and systemd, as we assume that these two daemons live in the same + * namespace at least. + * + * Note that some of the "files" exported here are actually symlinks and not regular files. Symlinks work + * better for storing small bits of data, in particular as we can write them with two system calls, and read + * them with one. */ + + (void) unit_export_invocation_id(u); + + c = unit_get_exec_context(u); + if (c) { + (void) unit_export_log_level_max(u, c); + (void) unit_export_log_extra_fields(u, c); + (void) unit_export_log_rate_limit_interval(u, c); + (void) unit_export_log_rate_limit_burst(u, c); + } +} + +void unit_unlink_state_files(Unit *u) { + const char *p; + + assert(u); + + if (!u->id) + return; + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + /* Undoes the effect of unit_export_state() */ + + if (u->exported_invocation_id) { + p = strjoina("/run/systemd/units/invocation:", u->id); + (void) unlink(p); + + u->exported_invocation_id = false; + } + + if (u->exported_log_level_max) { + p = strjoina("/run/systemd/units/log-level-max:", u->id); + (void) unlink(p); + + u->exported_log_level_max = false; + } + + if (u->exported_log_extra_fields) { + p = strjoina("/run/systemd/units/extra-fields:", u->id); + (void) unlink(p); + + u->exported_log_extra_fields = false; + } + + if (u->exported_log_rate_limit_interval) { + p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id); + (void) unlink(p); + + u->exported_log_rate_limit_interval = false; + } + + if (u->exported_log_rate_limit_burst) { + p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id); + (void) unlink(p); + + u->exported_log_rate_limit_burst = false; + } +} + +int unit_prepare_exec(Unit *u) { + int r; + + assert(u); + + /* Prepares everything so that we can fork of a process for this unit */ + + (void) unit_realize_cgroup(u); + + if (u->reset_accounting) { + (void) unit_reset_cpu_accounting(u); + (void) unit_reset_ip_accounting(u); + u->reset_accounting = false; + } + + unit_export_state_files(u); + + r = unit_setup_exec_runtime(u); + if (r < 0) + return r; + + r = unit_setup_dynamic_creds(u); + if (r < 0) + return r; + + return 0; +} + +static void log_leftover(pid_t pid, int sig, void *userdata) { + _cleanup_free_ char *comm = NULL; + + (void) get_process_comm(pid, &comm); + + if (comm && comm[0] == '(') /* Most likely our own helper process (PAM?), ignore */ + return; + + log_unit_warning(userdata, + "Found left-over process " PID_FMT " (%s) in control group while starting unit. Ignoring.\n" + "This usually indicates unclean termination of a previous run, or service implementation deficiencies.", + pid, strna(comm)); +} + +void unit_warn_leftover_processes(Unit *u) { + assert(u); + + (void) unit_pick_cgroup_path(u); + + if (!u->cgroup_path) + return; + + (void) cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, 0, 0, NULL, log_leftover, u); +} + +bool unit_needs_console(Unit *u) { + ExecContext *ec; + UnitActiveState state; + + assert(u); + + state = unit_active_state(u); + + if (UNIT_IS_INACTIVE_OR_FAILED(state)) + return false; + + if (UNIT_VTABLE(u)->needs_console) + return UNIT_VTABLE(u)->needs_console(u); + + /* If this unit type doesn't implement this call, let's use a generic fallback implementation: */ + ec = unit_get_exec_context(u); + if (!ec) + return false; + + return exec_context_may_touch_console(ec); +} + +const char *unit_label_path(Unit *u) { + const char *p; + + /* Returns the file system path to use for MAC access decisions, i.e. the file to read the SELinux label off + * when validating access checks. */ + + p = u->source_path ?: u->fragment_path; + if (!p) + return NULL; + + /* If a unit is masked, then don't read the SELinux label of /dev/null, as that really makes no sense */ + if (path_equal(p, "/dev/null")) + return NULL; + + return p; +} + +int unit_pid_attachable(Unit *u, pid_t pid, sd_bus_error *error) { + int r; + + assert(u); + + /* Checks whether the specified PID is generally good for attaching, i.e. a valid PID, not our manager itself, + * and not a kernel thread either */ + + /* First, a simple range check */ + if (!pid_is_valid(pid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process identifier " PID_FMT " is not valid.", pid); + + /* Some extra safety check */ + if (pid == 1 || pid == getpid_cached()) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a manager process, refusing.", pid); + + /* Don't even begin to bother with kernel threads */ + r = is_kernel_thread(pid); + if (r == -ESRCH) + return sd_bus_error_setf(error, SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, "Process with ID " PID_FMT " does not exist.", pid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to determine whether process " PID_FMT " is a kernel thread: %m", pid); + if (r > 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a kernel thread, refusing.", pid); + + return 0; +} + +void unit_log_success(Unit *u) { + assert(u); + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_UNIT_SUCCESS_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Succeeded.")); +} + +void unit_log_failure(Unit *u, const char *result) { + assert(u); + assert(result); + + log_struct(LOG_WARNING, + "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILURE_RESULT_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Failed with result '%s'.", result), + "UNIT_RESULT=%s", result); +} + +void unit_log_process_exit( + Unit *u, + int level, + const char *kind, + const char *command, + int code, + int status) { + + assert(u); + assert(kind); + + if (code != CLD_EXITED) + level = LOG_WARNING; + + log_struct(level, + "MESSAGE_ID=" SD_MESSAGE_UNIT_PROCESS_EXIT_STR, + LOG_UNIT_MESSAGE(u, "%s exited, code=%s, status=%i/%s", + kind, + sigchld_code_to_string(code), status, + strna(code == CLD_EXITED + ? exit_status_to_string(status, EXIT_STATUS_FULL) + : signal_to_string(status))), + "EXIT_CODE=%s", sigchld_code_to_string(code), + "EXIT_STATUS=%i", status, + "COMMAND=%s", strna(command), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u)); +} + +int unit_exit_status(Unit *u) { + assert(u); + + /* Returns the exit status to propagate for the most recent cycle of this unit. Returns a value in the range + * 0…255 if there's something to propagate. EOPNOTSUPP if the concept does not apply to this unit type, ENODATA + * if no data is currently known (for example because the unit hasn't deactivated yet) and EBADE if the main + * service process has exited abnormally (signal/coredump). */ + + if (!UNIT_VTABLE(u)->exit_status) + return -EOPNOTSUPP; + + return UNIT_VTABLE(u)->exit_status(u); +} + +int unit_failure_action_exit_status(Unit *u) { + int r; + + assert(u); + + /* Returns the exit status to propagate on failure, or an error if there's nothing to propagate */ + + if (u->failure_action_exit_status >= 0) + return u->failure_action_exit_status; + + r = unit_exit_status(u); + if (r == -EBADE) /* Exited, but not cleanly (i.e. by signal or such) */ + return 255; + + return r; +} + +int unit_success_action_exit_status(Unit *u) { + int r; + + assert(u); + + /* Returns the exit status to propagate on success, or an error if there's nothing to propagate */ + + if (u->success_action_exit_status >= 0) + return u->success_action_exit_status; + + r = unit_exit_status(u); + if (r == -EBADE) /* Exited, but not cleanly (i.e. by signal or such) */ + return 255; + + return r; +} + +static const char* const collect_mode_table[_COLLECT_MODE_MAX] = { + [COLLECT_INACTIVE] = "inactive", + [COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed", +}; + +DEFINE_STRING_TABLE_LOOKUP(collect_mode, CollectMode); diff --git a/src/core/unit.h b/src/core/unit.h new file mode 100644 index 0000000..43cf157 --- /dev/null +++ b/src/core/unit.h @@ -0,0 +1,856 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <stdbool.h> +#include <stdlib.h> +#include <unistd.h> + +#include "bpf-program.h" +#include "condition.h" +#include "emergency-action.h" +#include "install.h" +#include "list.h" +#include "unit-name.h" +#include "cgroup.h" + +typedef struct UnitRef UnitRef; + +typedef enum KillOperation { + KILL_TERMINATE, + KILL_TERMINATE_AND_LOG, + KILL_KILL, + KILL_WATCHDOG, + _KILL_OPERATION_MAX, + _KILL_OPERATION_INVALID = -1 +} KillOperation; + +typedef enum CollectMode { + COLLECT_INACTIVE, + COLLECT_INACTIVE_OR_FAILED, + _COLLECT_MODE_MAX, + _COLLECT_MODE_INVALID = -1, +} CollectMode; + +static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) { + return IN_SET(t, UNIT_ACTIVE, UNIT_RELOADING); +} + +static inline bool UNIT_IS_ACTIVE_OR_ACTIVATING(UnitActiveState t) { + return IN_SET(t, UNIT_ACTIVE, UNIT_ACTIVATING, UNIT_RELOADING); +} + +static inline bool UNIT_IS_INACTIVE_OR_DEACTIVATING(UnitActiveState t) { + return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED, UNIT_DEACTIVATING); +} + +static inline bool UNIT_IS_INACTIVE_OR_FAILED(UnitActiveState t) { + return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED); +} + +/* Stores the 'reason' a dependency was created as a bit mask, i.e. due to which configuration source it came to be. We + * use this so that we can selectively flush out parts of dependencies again. Note that the same dependency might be + * created as a result of multiple "reasons", hence the bitmask. */ +typedef enum UnitDependencyMask { + /* Configured directly by the unit file, .wants/.requries symlink or drop-in, or as an immediate result of a + * non-dependency option configured that way. */ + UNIT_DEPENDENCY_FILE = 1 << 0, + + /* As unconditional implicit dependency (not affected by unit configuration — except by the unit name and + * type) */ + UNIT_DEPENDENCY_IMPLICIT = 1 << 1, + + /* A dependency effected by DefaultDependencies=yes. Note that dependencies marked this way are conceptually + * just a subset of UNIT_DEPENDENCY_FILE, as DefaultDependencies= is itself a unit file setting that can only + * be set in unit files. We make this two separate bits only to help debugging how dependencies came to be. */ + UNIT_DEPENDENCY_DEFAULT = 1 << 2, + + /* A dependency created from udev rules */ + UNIT_DEPENDENCY_UDEV = 1 << 3, + + /* A dependency created because of some unit's RequiresMountsFor= setting */ + UNIT_DEPENDENCY_PATH = 1 << 4, + + /* A dependency created because of data read from /proc/self/mountinfo and no other configuration source */ + UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT = 1 << 5, + + /* A dependency created because of data read from /proc/self/mountinfo, but conditionalized by + * DefaultDependencies= and thus also involving configuration from UNIT_DEPENDENCY_FILE sources */ + UNIT_DEPENDENCY_MOUNTINFO_DEFAULT = 1 << 6, + + /* A dependency created because of data read from /proc/swaps and no other configuration source */ + UNIT_DEPENDENCY_PROC_SWAP = 1 << 7, + + _UNIT_DEPENDENCY_MASK_FULL = (1 << 8) - 1, +} UnitDependencyMask; + +/* The Unit's dependencies[] hashmaps use this structure as value. It has the same size as a void pointer, and thus can + * be stored directly as hashmap value, without any indirection. Note that this stores two masks, as both the origin + * and the destination of a dependency might have created it. */ +typedef union UnitDependencyInfo { + void *data; + struct { + UnitDependencyMask origin_mask:16; + UnitDependencyMask destination_mask:16; + } _packed_; +} UnitDependencyInfo; + +#include "job.h" + +struct UnitRef { + /* Keeps tracks of references to a unit. This is useful so + * that we can merge two units if necessary and correct all + * references to them */ + + Unit *source, *target; + LIST_FIELDS(UnitRef, refs_by_target); +}; + +typedef struct Unit { + Manager *manager; + + UnitType type; + UnitLoadState load_state; + Unit *merged_into; + + char *id; /* One name is special because we use it for identification. Points to an entry in the names set */ + char *instance; + + Set *names; + + /* For each dependency type we maintain a Hashmap whose key is the Unit* object, and the value encodes why the + * dependency exists, using the UnitDependencyInfo type */ + Hashmap *dependencies[_UNIT_DEPENDENCY_MAX]; + + /* Similar, for RequiresMountsFor= path dependencies. The key is the path, the value the UnitDependencyInfo type */ + Hashmap *requires_mounts_for; + + char *description; + char **documentation; + + char *fragment_path; /* if loaded from a config file this is the primary path to it */ + char *source_path; /* if converted, the source file */ + char **dropin_paths; + + usec_t fragment_mtime; + usec_t source_mtime; + usec_t dropin_mtime; + + /* If this is a transient unit we are currently writing, this is where we are writing it to */ + FILE *transient_file; + + /* If there is something to do with this unit, then this is the installed job for it */ + Job *job; + + /* JOB_NOP jobs are special and can be installed without disturbing the real job. */ + Job *nop_job; + + /* The slot used for watching NameOwnerChanged signals */ + sd_bus_slot *match_bus_slot; + + /* References to this unit from clients */ + sd_bus_track *bus_track; + char **deserialized_refs; + + /* Job timeout and action to take */ + usec_t job_timeout; + usec_t job_running_timeout; + bool job_running_timeout_set:1; + EmergencyAction job_timeout_action; + char *job_timeout_reboot_arg; + + /* References to this */ + LIST_HEAD(UnitRef, refs_by_target); + + /* Conditions to check */ + LIST_HEAD(Condition, conditions); + LIST_HEAD(Condition, asserts); + + dual_timestamp condition_timestamp; + dual_timestamp assert_timestamp; + + /* Updated whenever the low-level state changes */ + dual_timestamp state_change_timestamp; + + /* Updated whenever the (high-level) active state enters or leaves the active or inactive states */ + dual_timestamp inactive_exit_timestamp; + dual_timestamp active_enter_timestamp; + dual_timestamp active_exit_timestamp; + dual_timestamp inactive_enter_timestamp; + + UnitRef slice; + + /* Per type list */ + LIST_FIELDS(Unit, units_by_type); + + /* Load queue */ + LIST_FIELDS(Unit, load_queue); + + /* D-Bus queue */ + LIST_FIELDS(Unit, dbus_queue); + + /* Cleanup queue */ + LIST_FIELDS(Unit, cleanup_queue); + + /* GC queue */ + LIST_FIELDS(Unit, gc_queue); + + /* CGroup realize members queue */ + LIST_FIELDS(Unit, cgroup_realize_queue); + + /* cgroup empty queue */ + LIST_FIELDS(Unit, cgroup_empty_queue); + + /* Target dependencies queue */ + LIST_FIELDS(Unit, target_deps_queue); + + /* Queue of units with StopWhenUnneeded set that shell be checked for clean-up. */ + LIST_FIELDS(Unit, stop_when_unneeded_queue); + + /* PIDs we keep an eye on. Note that a unit might have many + * more, but these are the ones we care enough about to + * process SIGCHLD for */ + Set *pids; + + /* Used in SIGCHLD and sd_notify() message event invocation logic to avoid that we dispatch the same event + * multiple times on the same unit. */ + unsigned sigchldgen; + unsigned notifygen; + + /* Used during GC sweeps */ + unsigned gc_marker; + + /* Error code when we didn't manage to load the unit (negative) */ + int load_error; + + /* Put a ratelimit on unit starting */ + RateLimit start_limit; + EmergencyAction start_limit_action; + + /* What to do on failure or success */ + EmergencyAction success_action, failure_action; + int success_action_exit_status, failure_action_exit_status; + char *reboot_arg; + + /* Make sure we never enter endless loops with the check unneeded logic, or the BindsTo= logic */ + RateLimit auto_stop_ratelimit; + + /* Reference to a specific UID/GID */ + uid_t ref_uid; + gid_t ref_gid; + + /* Cached unit file state and preset */ + UnitFileState unit_file_state; + int unit_file_preset; + + /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */ + nsec_t cpu_usage_base; + nsec_t cpu_usage_last; /* the most recently read value */ + + /* Counterparts in the cgroup filesystem */ + char *cgroup_path; + CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */ + CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */ + CGroupMask cgroup_invalidated_mask; /* A mask specifiying controllers which shall be considered invalidated, and require re-realization */ + CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ + int cgroup_inotify_wd; + + /* Device Controller BPF program */ + BPFProgram *bpf_device_control_installed; + + /* IP BPF Firewalling/accounting */ + int ip_accounting_ingress_map_fd; + int ip_accounting_egress_map_fd; + + int ipv4_allow_map_fd; + int ipv6_allow_map_fd; + int ipv4_deny_map_fd; + int ipv6_deny_map_fd; + + BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed; + BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed; + + uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX]; + + /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new + * ones which might have appeared. */ + sd_event_source *rewatch_pids_event_source; + + /* How to start OnFailure units */ + JobMode on_failure_job_mode; + + /* Tweaking the GC logic */ + CollectMode collect_mode; + + /* The current invocation ID */ + sd_id128_t invocation_id; + char invocation_id_string[SD_ID128_STRING_MAX]; /* useful when logging */ + + /* Garbage collect us we nobody wants or requires us anymore */ + bool stop_when_unneeded; + + /* Create default dependencies */ + bool default_dependencies; + + /* Refuse manual starting, allow starting only indirectly via dependency. */ + bool refuse_manual_start; + + /* Don't allow the user to stop this unit manually, allow stopping only indirectly via dependency. */ + bool refuse_manual_stop; + + /* Allow isolation requests */ + bool allow_isolate; + + /* Ignore this unit when isolating */ + bool ignore_on_isolate; + + /* Did the last condition check succeed? */ + bool condition_result; + bool assert_result; + + /* Is this a transient unit? */ + bool transient; + + /* Is this a unit that is always running and cannot be stopped? */ + bool perpetual; + + /* Booleans indicating membership of this unit in the various queues */ + bool in_load_queue:1; + bool in_dbus_queue:1; + bool in_cleanup_queue:1; + bool in_gc_queue:1; + bool in_cgroup_realize_queue:1; + bool in_cgroup_empty_queue:1; + bool in_target_deps_queue:1; + bool in_stop_when_unneeded_queue:1; + + bool sent_dbus_new_signal:1; + + bool in_audit:1; + bool on_console:1; + + bool cgroup_realized:1; + bool cgroup_members_mask_valid:1; + + /* Reset cgroup accounting next time we fork something off */ + bool reset_accounting:1; + + bool start_limit_hit:1; + + /* Did we already invoke unit_coldplug() for this unit? */ + bool coldplugged:1; + + /* For transient units: whether to add a bus track reference after creating the unit */ + bool bus_track_add:1; + + /* Remember which unit state files we created */ + bool exported_invocation_id:1; + bool exported_log_level_max:1; + bool exported_log_extra_fields:1; + bool exported_log_rate_limit_interval:1; + bool exported_log_rate_limit_burst:1; + + /* When writing transient unit files, stores which section we stored last. If < 0, we didn't write any yet. If + * == 0 we are in the [Unit] section, if > 0 we are in the unit type-specific section. */ + signed int last_section_private:2; +} Unit; + +typedef struct UnitStatusMessageFormats { + const char *starting_stopping[2]; + const char *finished_start_job[_JOB_RESULT_MAX]; + const char *finished_stop_job[_JOB_RESULT_MAX]; +} UnitStatusMessageFormats; + +/* Flags used when writing drop-in files or transient unit files */ +typedef enum UnitWriteFlags { + /* Write a runtime unit file or drop-in (i.e. one below /run) */ + UNIT_RUNTIME = 1 << 0, + + /* Write a persistent drop-in (i.e. one below /etc) */ + UNIT_PERSISTENT = 1 << 1, + + /* Place this item in the per-unit-type private section, instead of [Unit] */ + UNIT_PRIVATE = 1 << 2, + + /* Apply specifier escaping before writing */ + UNIT_ESCAPE_SPECIFIERS = 1 << 3, + + /* Apply C escaping before writing */ + UNIT_ESCAPE_C = 1 << 4, +} UnitWriteFlags; + +/* Returns true if neither persistent, nor runtime storage is requested, i.e. this is a check invocation only */ +static inline bool UNIT_WRITE_FLAGS_NOOP(UnitWriteFlags flags) { + return (flags & (UNIT_RUNTIME|UNIT_PERSISTENT)) == 0; +} + +#include "kill.h" + +typedef struct UnitVTable { + /* How much memory does an object of this unit type need */ + size_t object_size; + + /* If greater than 0, the offset into the object where + * ExecContext is found, if the unit type has that */ + size_t exec_context_offset; + + /* If greater than 0, the offset into the object where + * CGroupContext is found, if the unit type has that */ + size_t cgroup_context_offset; + + /* If greater than 0, the offset into the object where + * KillContext is found, if the unit type has that */ + size_t kill_context_offset; + + /* If greater than 0, the offset into the object where the + * pointer to ExecRuntime is found, if the unit type has + * that */ + size_t exec_runtime_offset; + + /* If greater than 0, the offset into the object where the pointer to DynamicCreds is found, if the unit type + * has that. */ + size_t dynamic_creds_offset; + + /* The name of the configuration file section with the private settings of this unit */ + const char *private_section; + + /* Config file sections this unit type understands, separated + * by NUL chars */ + const char *sections; + + /* This should reset all type-specific variables. This should + * not allocate memory, and is called with zero-initialized + * data. It should hence only initialize variables that need + * to be set != 0. */ + void (*init)(Unit *u); + + /* This should free all type-specific variables. It should be + * idempotent. */ + void (*done)(Unit *u); + + /* Actually load data from disk. This may fail, and should set + * load_state to UNIT_LOADED, UNIT_MERGED or leave it at + * UNIT_STUB if no configuration could be found. */ + int (*load)(Unit *u); + + /* During deserialization we only record the intended state to return to. With coldplug() we actually put the + * deserialized state in effect. This is where unit_notify() should be called to start things up. Note that + * this callback is invoked *before* we leave the reloading state of the manager, i.e. *before* we consider the + * reloading to be complete. Thus, this callback should just restore the exact same state for any unit that was + * in effect before the reload, i.e. units should not catch up with changes happened during the reload. That's + * what catchup() below is for. */ + int (*coldplug)(Unit *u); + + /* This is called shortly after all units' coldplug() call was invoked, and *after* the manager left the + * reloading state. It's supposed to catch up with state changes due to external events we missed so far (for + * example because they took place while we were reloading/reexecing) */ + void (*catchup)(Unit *u); + + void (*dump)(Unit *u, FILE *f, const char *prefix); + + int (*start)(Unit *u); + int (*stop)(Unit *u); + int (*reload)(Unit *u); + + int (*kill)(Unit *u, KillWho w, int signo, sd_bus_error *error); + + bool (*can_reload)(Unit *u); + + /* Write all data that cannot be restored from other sources + * away using unit_serialize_item() */ + int (*serialize)(Unit *u, FILE *f, FDSet *fds); + + /* Restore one item from the serialization */ + int (*deserialize_item)(Unit *u, const char *key, const char *data, FDSet *fds); + + /* Try to match up fds with what we need for this unit */ + void (*distribute_fds)(Unit *u, FDSet *fds); + + /* Boils down the more complex internal state of this unit to + * a simpler one that the engine can understand */ + UnitActiveState (*active_state)(Unit *u); + + /* Returns the substate specific to this unit type as + * string. This is purely information so that we can give the + * user a more fine grained explanation in which actual state a + * unit is in. */ + const char* (*sub_state_to_string)(Unit *u); + + /* Additionally to UnitActiveState determine whether unit is to be restarted. */ + bool (*will_restart)(Unit *u); + + /* Return false when there is a reason to prevent this unit from being gc'ed + * even though nothing references it and it isn't active in any way. */ + bool (*may_gc)(Unit *u); + + /* When the unit is not running and no job for it queued we shall release its runtime resources */ + void (*release_resources)(Unit *u); + + /* Invoked on every child that died */ + void (*sigchld_event)(Unit *u, pid_t pid, int code, int status); + + /* Reset failed state if we are in failed state */ + void (*reset_failed)(Unit *u); + + /* Called whenever any of the cgroups this unit watches for + * ran empty */ + void (*notify_cgroup_empty)(Unit *u); + + /* Called whenever a process of this unit sends us a message */ + void (*notify_message)(Unit *u, const struct ucred *ucred, char **tags, FDSet *fds); + + /* Called whenever a name this Unit registered for comes or goes away. */ + void (*bus_name_owner_change)(Unit *u, const char *name, const char *old_owner, const char *new_owner); + + /* Called for each property that is being set */ + int (*bus_set_property)(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + + /* Called after at least one property got changed to apply the necessary change */ + int (*bus_commit_properties)(Unit *u); + + /* Return the unit this unit is following */ + Unit *(*following)(Unit *u); + + /* Return the set of units that are following each other */ + int (*following_set)(Unit *u, Set **s); + + /* Invoked each time a unit this unit is triggering changes + * state or gains/loses a job */ + void (*trigger_notify)(Unit *u, Unit *trigger); + + /* Called whenever CLOCK_REALTIME made a jump */ + void (*time_change)(Unit *u); + + /* Called whenever /etc/localtime was modified */ + void (*timezone_change)(Unit *u); + + /* Returns the next timeout of a unit */ + int (*get_timeout)(Unit *u, usec_t *timeout); + + /* Returns the main PID if there is any defined, or 0. */ + pid_t (*main_pid)(Unit *u); + + /* Returns the main PID if there is any defined, or 0. */ + pid_t (*control_pid)(Unit *u); + + /* Returns true if the unit currently needs access to the console */ + bool (*needs_console)(Unit *u); + + /* Returns the exit status to propagate in case of FailureAction=exit/SuccessAction=exit; usually returns the + * exit code of the "main" process of the service or similar. */ + int (*exit_status)(Unit *u); + + /* Like the enumerate() callback further down, but only enumerates the perpetual units, i.e. all units that + * unconditionally exist and are always active. The main reason to keep both enumeration functions separate is + * philosophical: the state of perpetual units should be put in place by coldplug(), while the state of those + * discovered through regular enumeration should be put in place by catchup(), see below. */ + void (*enumerate_perpetual)(Manager *m); + + /* This is called for each unit type and should be used to enumerate units already existing in the system + * internally and load them. However, everything that is loaded here should still stay in inactive state. It is + * the job of the catchup() call above to put the units into the discovered state. */ + void (*enumerate)(Manager *m); + + /* Type specific cleanups. */ + void (*shutdown)(Manager *m); + + /* If this function is set and return false all jobs for units + * of this type will immediately fail. */ + bool (*supported)(void); + + /* The bus vtable */ + const sd_bus_vtable *bus_vtable; + + /* The strings to print in status messages */ + UnitStatusMessageFormats status_message_formats; + + /* True if transient units of this type are OK */ + bool can_transient:1; + + /* True if cgroup delegation is permissible */ + bool can_delegate:1; + + /* True if units of this type shall be startable only once and then never again */ + bool once_only:1; + + /* True if queued jobs of this type should be GC'ed if no other job needs them anymore */ + bool gc_jobs:1; +} UnitVTable; + +extern const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX]; + +static inline const UnitVTable* UNIT_VTABLE(Unit *u) { + return unit_vtable[u->type]; +} + +/* For casting a unit into the various unit types */ +#define DEFINE_CAST(UPPERCASE, MixedCase) \ + static inline MixedCase* UPPERCASE(Unit *u) { \ + if (_unlikely_(!u || u->type != UNIT_##UPPERCASE)) \ + return NULL; \ + \ + return (MixedCase*) u; \ + } + +/* For casting the various unit types into a unit */ +#define UNIT(u) \ + ({ \ + typeof(u) _u_ = (u); \ + Unit *_w_ = _u_ ? &(_u_)->meta : NULL; \ + _w_; \ + }) + +#define UNIT_HAS_EXEC_CONTEXT(u) (UNIT_VTABLE(u)->exec_context_offset > 0) +#define UNIT_HAS_CGROUP_CONTEXT(u) (UNIT_VTABLE(u)->cgroup_context_offset > 0) +#define UNIT_HAS_KILL_CONTEXT(u) (UNIT_VTABLE(u)->kill_context_offset > 0) + +static inline Unit* UNIT_TRIGGER(Unit *u) { + return hashmap_first_key(u->dependencies[UNIT_TRIGGERS]); +} + +Unit *unit_new(Manager *m, size_t size); +void unit_free(Unit *u); +DEFINE_TRIVIAL_CLEANUP_FUNC(Unit *, unit_free); + +int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret); +int unit_add_name(Unit *u, const char *name); + +int unit_add_dependency(Unit *u, UnitDependency d, Unit *other, bool add_reference, UnitDependencyMask mask); +int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask); + +int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask); +int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask); + +int unit_add_exec_dependencies(Unit *u, ExecContext *c); + +int unit_choose_id(Unit *u, const char *name); +int unit_set_description(Unit *u, const char *description); + +bool unit_may_gc(Unit *u); + +void unit_add_to_load_queue(Unit *u); +void unit_add_to_dbus_queue(Unit *u); +void unit_add_to_cleanup_queue(Unit *u); +void unit_add_to_gc_queue(Unit *u); +void unit_add_to_target_deps_queue(Unit *u); +void unit_submit_to_stop_when_unneeded_queue(Unit *u); + +int unit_merge(Unit *u, Unit *other); +int unit_merge_by_name(Unit *u, const char *other); + +Unit *unit_follow_merge(Unit *u) _pure_; + +int unit_load_fragment_and_dropin(Unit *u); +int unit_load_fragment_and_dropin_optional(Unit *u); +int unit_load(Unit *unit); + +int unit_set_slice(Unit *u, Unit *slice); +int unit_set_default_slice(Unit *u); + +const char *unit_description(Unit *u) _pure_; + +bool unit_has_name(const Unit *u, const char *name); + +UnitActiveState unit_active_state(Unit *u); + +const char* unit_sub_state_to_string(Unit *u); + +void unit_dump(Unit *u, FILE *f, const char *prefix); + +bool unit_can_reload(Unit *u) _pure_; +bool unit_can_start(Unit *u) _pure_; +bool unit_can_stop(Unit *u) _pure_; +bool unit_can_isolate(Unit *u) _pure_; + +int unit_start(Unit *u); +int unit_stop(Unit *u); +int unit_reload(Unit *u); + +int unit_kill(Unit *u, KillWho w, int signo, sd_bus_error *error); +int unit_kill_common(Unit *u, KillWho who, int signo, pid_t main_pid, pid_t control_pid, sd_bus_error *error); + +typedef enum UnitNotifyFlags { + UNIT_NOTIFY_RELOAD_FAILURE = 1 << 0, + UNIT_NOTIFY_WILL_AUTO_RESTART = 1 << 1, +} UnitNotifyFlags; + +void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, UnitNotifyFlags flags); + +int unit_watch_pid(Unit *u, pid_t pid); +void unit_unwatch_pid(Unit *u, pid_t pid); +void unit_unwatch_all_pids(Unit *u); + +int unit_enqueue_rewatch_pids(Unit *u); +void unit_dequeue_rewatch_pids(Unit *u); + +int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name); +int unit_watch_bus_name(Unit *u, const char *name); +void unit_unwatch_bus_name(Unit *u, const char *name); + +bool unit_job_is_applicable(Unit *u, JobType j); + +int set_unit_path(const char *p); + +char *unit_dbus_path(Unit *u); +char *unit_dbus_path_invocation_id(Unit *u); + +int unit_load_related_unit(Unit *u, const char *type, Unit **_found); + +bool unit_can_serialize(Unit *u) _pure_; + +int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs); +int unit_deserialize(Unit *u, FILE *f, FDSet *fds); +int unit_deserialize_skip(FILE *f); + +int unit_add_node_dependency(Unit *u, const char *what, bool wants, UnitDependency d, UnitDependencyMask mask); + +int unit_coldplug(Unit *u); +void unit_catchup(Unit *u); + +void unit_status_printf(Unit *u, const char *status, const char *unit_status_msg_format) _printf_(3, 0); + +bool unit_need_daemon_reload(Unit *u); + +void unit_reset_failed(Unit *u); + +Unit *unit_following(Unit *u); +int unit_following_set(Unit *u, Set **s); + +const char *unit_slice_name(Unit *u); + +bool unit_stop_pending(Unit *u) _pure_; +bool unit_inactive_or_pending(Unit *u) _pure_; +bool unit_active_or_pending(Unit *u); +bool unit_will_restart(Unit *u); + +int unit_add_default_target_dependency(Unit *u, Unit *target); + +void unit_start_on_failure(Unit *u); +void unit_trigger_notify(Unit *u); + +UnitFileState unit_get_unit_file_state(Unit *u); +int unit_get_unit_file_preset(Unit *u); + +Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target); +void unit_ref_unset(UnitRef *ref); + +#define UNIT_DEREF(ref) ((ref).target) +#define UNIT_ISSET(ref) (!!(ref).target) + +int unit_patch_contexts(Unit *u); + +ExecContext *unit_get_exec_context(Unit *u) _pure_; +KillContext *unit_get_kill_context(Unit *u) _pure_; +CGroupContext *unit_get_cgroup_context(Unit *u) _pure_; + +ExecRuntime *unit_get_exec_runtime(Unit *u) _pure_; + +int unit_setup_exec_runtime(Unit *u); +int unit_setup_dynamic_creds(Unit *u); + +char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf); +char* unit_concat_strv(char **l, UnitWriteFlags flags); + +int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data); +int unit_write_settingf(Unit *u, UnitWriteFlags mode, const char *name, const char *format, ...) _printf_(4,5); + +int unit_kill_context(Unit *u, KillContext *c, KillOperation k, pid_t main_pid, pid_t control_pid, bool main_pid_alien); + +int unit_make_transient(Unit *u); + +int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask); + +bool unit_type_supported(UnitType t); + +bool unit_is_pristine(Unit *u); + +bool unit_is_unneeded(Unit *u); + +pid_t unit_control_pid(Unit *u); +pid_t unit_main_pid(Unit *u); + +static inline bool unit_supported(Unit *u) { + return unit_type_supported(u->type); +} + +void unit_warn_if_dir_nonempty(Unit *u, const char* where); +int unit_fail_if_noncanonical(Unit *u, const char* where); + +int unit_start_limit_test(Unit *u); + +void unit_unref_uid(Unit *u, bool destroy_now); +int unit_ref_uid(Unit *u, uid_t uid, bool clean_ipc); + +void unit_unref_gid(Unit *u, bool destroy_now); +int unit_ref_gid(Unit *u, gid_t gid, bool clean_ipc); + +int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid); +void unit_unref_uid_gid(Unit *u, bool destroy_now); + +void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid); + +int unit_set_invocation_id(Unit *u, sd_id128_t id); +int unit_acquire_invocation_id(Unit *u); + +bool unit_shall_confirm_spawn(Unit *u); + +int unit_set_exec_params(Unit *s, ExecParameters *p); + +int unit_fork_helper_process(Unit *u, const char *name, pid_t *ret); + +void unit_remove_dependencies(Unit *u, UnitDependencyMask mask); + +void unit_export_state_files(Unit *u); +void unit_unlink_state_files(Unit *u); + +int unit_prepare_exec(Unit *u); + +void unit_warn_leftover_processes(Unit *u); + +bool unit_needs_console(Unit *u); + +const char *unit_label_path(Unit *u); + +int unit_pid_attachable(Unit *unit, pid_t pid, sd_bus_error *error); + +void unit_log_success(Unit *u); +void unit_log_failure(Unit *u, const char *result); +static inline void unit_log_result(Unit *u, bool success, const char *result) { + if (success) + unit_log_success(u); + else + unit_log_failure(u, result); +} + +void unit_log_process_exit(Unit *u, int level, const char *kind, const char *command, int code, int status); + +int unit_exit_status(Unit *u); +int unit_success_action_exit_status(Unit *u); +int unit_failure_action_exit_status(Unit *u); + +/* Macros which append UNIT= or USER_UNIT= to the message */ + +#define log_unit_full(unit, level, error, ...) \ + ({ \ + const Unit *_u = (unit); \ + _u ? log_object_internal(level, error, __FILE__, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \ + log_internal(level, error, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ + }) + +#define log_unit_debug(unit, ...) log_unit_full(unit, LOG_DEBUG, 0, ##__VA_ARGS__) +#define log_unit_info(unit, ...) log_unit_full(unit, LOG_INFO, 0, ##__VA_ARGS__) +#define log_unit_notice(unit, ...) log_unit_full(unit, LOG_NOTICE, 0, ##__VA_ARGS__) +#define log_unit_warning(unit, ...) log_unit_full(unit, LOG_WARNING, 0, ##__VA_ARGS__) +#define log_unit_error(unit, ...) log_unit_full(unit, LOG_ERR, 0, ##__VA_ARGS__) + +#define log_unit_debug_errno(unit, error, ...) log_unit_full(unit, LOG_DEBUG, error, ##__VA_ARGS__) +#define log_unit_info_errno(unit, error, ...) log_unit_full(unit, LOG_INFO, error, ##__VA_ARGS__) +#define log_unit_notice_errno(unit, error, ...) log_unit_full(unit, LOG_NOTICE, error, ##__VA_ARGS__) +#define log_unit_warning_errno(unit, error, ...) log_unit_full(unit, LOG_WARNING, error, ##__VA_ARGS__) +#define log_unit_error_errno(unit, error, ...) log_unit_full(unit, LOG_ERR, error, ##__VA_ARGS__) + +#define LOG_UNIT_MESSAGE(unit, fmt, ...) "MESSAGE=%s: " fmt, (unit)->id, ##__VA_ARGS__ +#define LOG_UNIT_ID(unit) (unit)->manager->unit_log_format_string, (unit)->id +#define LOG_UNIT_INVOCATION_ID(unit) (unit)->manager->invocation_log_format_string, (unit)->invocation_id_string + +const char* collect_mode_to_string(CollectMode m) _const_; +CollectMode collect_mode_from_string(const char *s) _pure_; diff --git a/src/core/user.conf b/src/core/user.conf new file mode 100644 index 0000000..b427f1e --- /dev/null +++ b/src/core/user.conf @@ -0,0 +1,44 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# You can override the directives in this file by creating files in +# /etc/systemd/user.conf.d/*.conf. +# +# See systemd-user.conf(5) for details + +[Manager] +#LogLevel=info +#LogTarget=console +#LogColor=yes +#LogLocation=no +#SystemCallArchitectures= +#TimerSlackNSec= +#DefaultTimerAccuracySec=1min +#DefaultStandardOutput=inherit +#DefaultStandardError=inherit +#DefaultTimeoutStartSec=90s +#DefaultTimeoutStopSec=90s +#DefaultRestartSec=100ms +#DefaultStartLimitIntervalSec=10s +#DefaultStartLimitBurst=5 +#DefaultEnvironment= +#DefaultLimitCPU= +#DefaultLimitFSIZE= +#DefaultLimitDATA= +#DefaultLimitSTACK= +#DefaultLimitCORE= +#DefaultLimitRSS= +#DefaultLimitNOFILE= +#DefaultLimitAS= +#DefaultLimitNPROC= +#DefaultLimitMEMLOCK= +#DefaultLimitLOCKS= +#DefaultLimitSIGPENDING= +#DefaultLimitMSGQUEUE= +#DefaultLimitNICE= +#DefaultLimitRTPRIO= +#DefaultLimitRTTIME= diff --git a/src/coredump/coredump-vacuum.c b/src/coredump/coredump-vacuum.c new file mode 100644 index 0000000..6ce5dfc --- /dev/null +++ b/src/coredump/coredump-vacuum.c @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <sys/statvfs.h> + +#include "alloc-util.h" +#include "coredump-vacuum.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "macro.h" +#include "string-util.h" +#include "time-util.h" +#include "user-util.h" +#include "util.h" + +#define DEFAULT_MAX_USE_LOWER (uint64_t) (1ULL*1024ULL*1024ULL) /* 1 MiB */ +#define DEFAULT_MAX_USE_UPPER (uint64_t) (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */ +#define DEFAULT_KEEP_FREE_UPPER (uint64_t) (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */ +#define DEFAULT_KEEP_FREE (uint64_t) (1024ULL*1024ULL) /* 1 MB */ + +struct vacuum_candidate { + unsigned n_files; + char *oldest_file; + usec_t oldest_mtime; +}; + +static void vacuum_candidate_free(struct vacuum_candidate *c) { + if (!c) + return; + + free(c->oldest_file); + free(c); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct vacuum_candidate*, vacuum_candidate_free); + +static void vacuum_candidate_hashmap_free(Hashmap *h) { + hashmap_free_with_destructor(h, vacuum_candidate_free); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Hashmap*, vacuum_candidate_hashmap_free); + +static int uid_from_file_name(const char *filename, uid_t *uid) { + const char *p, *e, *u; + + p = startswith(filename, "core."); + if (!p) + return -EINVAL; + + /* Skip the comm field */ + p = strchr(p, '.'); + if (!p) + return -EINVAL; + p++; + + /* Find end up UID */ + e = strchr(p, '.'); + if (!e) + return -EINVAL; + + u = strndupa(p, e-p); + return parse_uid(u, uid); +} + +static bool vacuum_necessary(int fd, uint64_t sum, uint64_t keep_free, uint64_t max_use) { + uint64_t fs_size = 0, fs_free = (uint64_t) -1; + struct statvfs sv; + + assert(fd >= 0); + + if (fstatvfs(fd, &sv) >= 0) { + fs_size = sv.f_frsize * sv.f_blocks; + fs_free = sv.f_frsize * sv.f_bfree; + } + + if (max_use == (uint64_t) -1) { + + if (fs_size > 0) { + max_use = PAGE_ALIGN(fs_size / 10); /* 10% */ + + if (max_use > DEFAULT_MAX_USE_UPPER) + max_use = DEFAULT_MAX_USE_UPPER; + + if (max_use < DEFAULT_MAX_USE_LOWER) + max_use = DEFAULT_MAX_USE_LOWER; + } else + max_use = DEFAULT_MAX_USE_LOWER; + } else + max_use = PAGE_ALIGN(max_use); + + if (max_use > 0 && sum > max_use) + return true; + + if (keep_free == (uint64_t) -1) { + + if (fs_size > 0) { + keep_free = PAGE_ALIGN((fs_size * 3) / 20); /* 15% */ + + if (keep_free > DEFAULT_KEEP_FREE_UPPER) + keep_free = DEFAULT_KEEP_FREE_UPPER; + } else + keep_free = DEFAULT_KEEP_FREE; + } else + keep_free = PAGE_ALIGN(keep_free); + + if (keep_free > 0 && fs_free < keep_free) + return true; + + return false; +} + +int coredump_vacuum(int exclude_fd, uint64_t keep_free, uint64_t max_use) { + _cleanup_closedir_ DIR *d = NULL; + struct stat exclude_st; + int r; + + if (keep_free == 0 && max_use == 0) + return 0; + + if (exclude_fd >= 0) { + if (fstat(exclude_fd, &exclude_st) < 0) + return log_error_errno(errno, "Failed to fstat(): %m"); + } + + /* This algorithm will keep deleting the oldest file of the + * user with the most coredumps until we are back in the size + * limits. Note that vacuuming for journal files is different, + * because we rely on rate-limiting of the messages there, + * to avoid being flooded. */ + + d = opendir("/var/lib/systemd/coredump"); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Can't open coredump directory: %m"); + } + + for (;;) { + _cleanup_(vacuum_candidate_hashmap_freep) Hashmap *h = NULL; + struct vacuum_candidate *worst = NULL; + struct dirent *de; + uint64_t sum = 0; + + rewinddir(d); + + FOREACH_DIRENT(de, d, goto fail) { + struct vacuum_candidate *c; + struct stat st; + uid_t uid; + usec_t t; + + r = uid_from_file_name(de->d_name, &uid); + if (r < 0) + continue; + + if (fstatat(dirfd(d), de->d_name, &st, AT_NO_AUTOMOUNT|AT_SYMLINK_NOFOLLOW) < 0) { + if (errno == ENOENT) + continue; + + log_warning_errno(errno, "Failed to stat /var/lib/systemd/coredump/%s: %m", de->d_name); + continue; + } + + if (!S_ISREG(st.st_mode)) + continue; + + if (exclude_fd >= 0 && + exclude_st.st_dev == st.st_dev && + exclude_st.st_ino == st.st_ino) + continue; + + r = hashmap_ensure_allocated(&h, NULL); + if (r < 0) + return log_oom(); + + t = timespec_load(&st.st_mtim); + + c = hashmap_get(h, UID_TO_PTR(uid)); + if (c) { + + if (t < c->oldest_mtime) { + char *n; + + n = strdup(de->d_name); + if (!n) + return log_oom(); + + free(c->oldest_file); + c->oldest_file = n; + c->oldest_mtime = t; + } + + } else { + _cleanup_(vacuum_candidate_freep) struct vacuum_candidate *n = NULL; + + n = new0(struct vacuum_candidate, 1); + if (!n) + return log_oom(); + + n->oldest_file = strdup(de->d_name); + if (!n->oldest_file) + return log_oom(); + + n->oldest_mtime = t; + + r = hashmap_put(h, UID_TO_PTR(uid), n); + if (r < 0) + return log_oom(); + + c = TAKE_PTR(n); + } + + c->n_files++; + + if (!worst || + worst->n_files < c->n_files || + (worst->n_files == c->n_files && c->oldest_mtime < worst->oldest_mtime)) + worst = c; + + sum += st.st_blocks * 512; + } + + if (!worst) + break; + + r = vacuum_necessary(dirfd(d), sum, keep_free, max_use); + if (r <= 0) + return r; + + r = unlinkat_deallocate(dirfd(d), worst->oldest_file, 0); + if (r == -ENOENT) + continue; + if (r < 0) + return log_error_errno(r, "Failed to remove file %s: %m", worst->oldest_file); + + log_info("Removed old coredump %s.", worst->oldest_file); + } + + return 0; + +fail: + return log_error_errno(errno, "Failed to read directory: %m"); +} diff --git a/src/coredump/coredump-vacuum.h b/src/coredump/coredump-vacuum.h new file mode 100644 index 0000000..0db1167 --- /dev/null +++ b/src/coredump/coredump-vacuum.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include <inttypes.h> +#include <sys/types.h> + +int coredump_vacuum(int exclude_fd, uint64_t keep_free, uint64_t max_use); diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c new file mode 100644 index 0000000..ecbb4bf --- /dev/null +++ b/src/coredump/coredump.c @@ -0,0 +1,1405 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <errno.h> +#include <stdio.h> +#include <stdio_ext.h> +#include <sys/prctl.h> +#include <sys/xattr.h> +#include <unistd.h> + +#if HAVE_ELFUTILS +#include <dwarf.h> +#include <elfutils/libdwfl.h> +#endif + +#include "sd-daemon.h" +#include "sd-journal.h" +#include "sd-login.h" +#include "sd-messages.h" + +#include "acl-util.h" +#include "alloc-util.h" +#include "capability-util.h" +#include "cgroup-util.h" +#include "compress.h" +#include "conf-parser.h" +#include "copy.h" +#include "coredump-vacuum.h" +#include "dirent-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "journal-importer.h" +#include "log.h" +#include "macro.h" +#include "main-func.h" +#include "missing.h" +#include "mkdir.h" +#include "parse-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "special.h" +#include "stacktrace.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "user-util.h" +#include "util.h" + +/* The maximum size up to which we process coredumps */ +#define PROCESS_SIZE_MAX ((uint64_t) (2LLU*1024LLU*1024LLU*1024LLU)) + +/* The maximum size up to which we leave the coredump around on disk */ +#define EXTERNAL_SIZE_MAX PROCESS_SIZE_MAX + +/* The maximum size up to which we store the coredump in the journal */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#define JOURNAL_SIZE_MAX ((size_t) (767LU*1024LU*1024LU)) +#else +/* oss-fuzz limits memory usage. */ +#define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU)) +#endif + +/* Make sure to not make this larger than the maximum journal entry + * size. See DATA_SIZE_MAX in journal-importer.h. */ +assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX); + +enum { + /* We use this as array indexes for a couple of special fields we use for + * naming coredump files, and attaching xattrs, and for indexing argv[]. + + * Our pattern for man:systectl(1) kernel.core_pattern is such that the + * kernel passes fields until CONTEXT_RLIMIT as arguments in argv[]. After + * that it gets complicated: the kernel passes "comm" as one or more fields + * starting at index CONTEXT_COMM (in other words, full "comm" is under index + * CONTEXT_COMM when it does not contain spaces, which is the common + * case). This mapping is not reversible, so we prefer to retrieve "comm" + * from /proc. We only fall back to argv[CONTEXT_COMM...] when that fails. + * + * In the internal context[] array, fields before CONTEXT_COMM are the + * strings from argv[], so they should not be freed. The strings at indices + * CONTEXT_COMM and higher are allocated by us and should be freed at the + * end. + */ + CONTEXT_PID, + CONTEXT_UID, + CONTEXT_GID, + CONTEXT_SIGNAL, + CONTEXT_TIMESTAMP, + CONTEXT_RLIMIT, + CONTEXT_HOSTNAME, + CONTEXT_COMM, + CONTEXT_EXE, + CONTEXT_UNIT, + _CONTEXT_MAX +}; + +typedef enum CoredumpStorage { + COREDUMP_STORAGE_NONE, + COREDUMP_STORAGE_EXTERNAL, + COREDUMP_STORAGE_JOURNAL, + _COREDUMP_STORAGE_MAX, + _COREDUMP_STORAGE_INVALID = -1 +} CoredumpStorage; + +static const char* const coredump_storage_table[_COREDUMP_STORAGE_MAX] = { + [COREDUMP_STORAGE_NONE] = "none", + [COREDUMP_STORAGE_EXTERNAL] = "external", + [COREDUMP_STORAGE_JOURNAL] = "journal", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(coredump_storage, CoredumpStorage); +static DEFINE_CONFIG_PARSE_ENUM(config_parse_coredump_storage, coredump_storage, CoredumpStorage, "Failed to parse storage setting"); + +static CoredumpStorage arg_storage = COREDUMP_STORAGE_EXTERNAL; +static bool arg_compress = true; +static uint64_t arg_process_size_max = PROCESS_SIZE_MAX; +static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX; +static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX; +static uint64_t arg_keep_free = (uint64_t) -1; +static uint64_t arg_max_use = (uint64_t) -1; + +static int parse_config(void) { + static const ConfigTableItem items[] = { + { "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage }, + { "Coredump", "Compress", config_parse_bool, 0, &arg_compress }, + { "Coredump", "ProcessSizeMax", config_parse_iec_uint64, 0, &arg_process_size_max }, + { "Coredump", "ExternalSizeMax", config_parse_iec_uint64, 0, &arg_external_size_max }, + { "Coredump", "JournalSizeMax", config_parse_iec_size, 0, &arg_journal_size_max }, + { "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free }, + { "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use }, + {} + }; + + return config_parse_many_nulstr(PKGSYSCONFDIR "/coredump.conf", + CONF_PATHS_NULSTR("systemd/coredump.conf.d"), + "Coredump\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, NULL); +} + +static uint64_t storage_size_max(void) { + if (arg_storage == COREDUMP_STORAGE_EXTERNAL) + return arg_external_size_max; + if (arg_storage == COREDUMP_STORAGE_JOURNAL) + return arg_journal_size_max; + assert(arg_storage == COREDUMP_STORAGE_NONE); + return 0; +} + +static int fix_acl(int fd, uid_t uid) { + +#if HAVE_ACL + _cleanup_(acl_freep) acl_t acl = NULL; + acl_entry_t entry; + acl_permset_t permset; + int r; + + assert(fd >= 0); + + if (uid_is_system(uid) || uid_is_dynamic(uid) || uid == UID_NOBODY) + return 0; + + /* Make sure normal users can read (but not write or delete) + * their own coredumps */ + + acl = acl_get_fd(fd); + if (!acl) + return log_error_errno(errno, "Failed to get ACL: %m"); + + if (acl_create_entry(&acl, &entry) < 0 || + acl_set_tag_type(entry, ACL_USER) < 0 || + acl_set_qualifier(entry, &uid) < 0) + return log_error_errno(errno, "Failed to patch ACL: %m"); + + if (acl_get_permset(entry, &permset) < 0 || + acl_add_perm(permset, ACL_READ) < 0) + return log_warning_errno(errno, "Failed to patch ACL: %m"); + + r = calc_acl_mask_if_needed(&acl); + if (r < 0) + return log_warning_errno(r, "Failed to patch ACL: %m"); + + if (acl_set_fd(fd, acl) < 0) + return log_error_errno(errno, "Failed to apply ACL: %m"); +#endif + + return 0; +} + +static int fix_xattr(int fd, const char *context[_CONTEXT_MAX]) { + + static const char * const xattrs[_CONTEXT_MAX] = { + [CONTEXT_PID] = "user.coredump.pid", + [CONTEXT_UID] = "user.coredump.uid", + [CONTEXT_GID] = "user.coredump.gid", + [CONTEXT_SIGNAL] = "user.coredump.signal", + [CONTEXT_TIMESTAMP] = "user.coredump.timestamp", + [CONTEXT_RLIMIT] = "user.coredump.rlimit", + [CONTEXT_HOSTNAME] = "user.coredump.hostname", + [CONTEXT_COMM] = "user.coredump.comm", + [CONTEXT_EXE] = "user.coredump.exe", + }; + + int r = 0; + unsigned i; + + assert(fd >= 0); + + /* Attach some metadata to coredumps via extended + * attributes. Just because we can. */ + + for (i = 0; i < _CONTEXT_MAX; i++) { + int k; + + if (isempty(context[i]) || !xattrs[i]) + continue; + + k = fsetxattr(fd, xattrs[i], context[i], strlen(context[i]), XATTR_CREATE); + if (k < 0 && r == 0) + r = -errno; + } + + return r; +} + +#define filename_escape(s) xescape((s), "./ ") + +static const char *coredump_tmpfile_name(const char *s) { + return s ? s : "(unnamed temporary file)"; +} + +static int fix_permissions( + int fd, + const char *filename, + const char *target, + const char *context[_CONTEXT_MAX], + uid_t uid) { + + int r; + + assert(fd >= 0); + assert(target); + assert(context); + + /* Ignore errors on these */ + (void) fchmod(fd, 0640); + (void) fix_acl(fd, uid); + (void) fix_xattr(fd, context); + + if (fsync(fd) < 0) + return log_error_errno(errno, "Failed to sync coredump %s: %m", coredump_tmpfile_name(filename)); + + (void) fsync_directory_of_file(fd); + + r = link_tmpfile(fd, filename, target); + if (r < 0) + return log_error_errno(r, "Failed to move coredump %s into place: %m", target); + + return 0; +} + +static int maybe_remove_external_coredump(const char *filename, uint64_t size) { + + /* Returns 1 if might remove, 0 if will not remove, < 0 on error. */ + + if (arg_storage == COREDUMP_STORAGE_EXTERNAL && + size <= arg_external_size_max) + return 0; + + if (!filename) + return 1; + + if (unlink(filename) < 0 && errno != ENOENT) + return log_error_errno(errno, "Failed to unlink %s: %m", filename); + + return 1; +} + +static int make_filename(const char *context[_CONTEXT_MAX], char **ret) { + _cleanup_free_ char *c = NULL, *u = NULL, *p = NULL, *t = NULL; + sd_id128_t boot = {}; + int r; + + assert(context); + + c = filename_escape(context[CONTEXT_COMM]); + if (!c) + return -ENOMEM; + + u = filename_escape(context[CONTEXT_UID]); + if (!u) + return -ENOMEM; + + r = sd_id128_get_boot(&boot); + if (r < 0) + return r; + + p = filename_escape(context[CONTEXT_PID]); + if (!p) + return -ENOMEM; + + t = filename_escape(context[CONTEXT_TIMESTAMP]); + if (!t) + return -ENOMEM; + + if (asprintf(ret, + "/var/lib/systemd/coredump/core.%s.%s." SD_ID128_FORMAT_STR ".%s.%s000000", + c, + u, + SD_ID128_FORMAT_VAL(boot), + p, + t) < 0) + return -ENOMEM; + + return 0; +} + +static int save_external_coredump( + const char *context[_CONTEXT_MAX], + int input_fd, + char **ret_filename, + int *ret_node_fd, + int *ret_data_fd, + uint64_t *ret_size, + bool *ret_truncated) { + + _cleanup_free_ char *fn = NULL, *tmp = NULL; + _cleanup_close_ int fd = -1; + uint64_t rlimit, process_limit, max_size; + struct stat st; + uid_t uid; + int r; + + assert(context); + assert(ret_filename); + assert(ret_node_fd); + assert(ret_data_fd); + assert(ret_size); + + r = parse_uid(context[CONTEXT_UID], &uid); + if (r < 0) + return log_error_errno(r, "Failed to parse UID: %m"); + + r = safe_atou64(context[CONTEXT_RLIMIT], &rlimit); + if (r < 0) + return log_error_errno(r, "Failed to parse resource limit '%s': %m", context[CONTEXT_RLIMIT]); + if (rlimit < page_size()) { + /* Is coredumping disabled? Then don't bother saving/processing the coredump. + * Anything below PAGE_SIZE cannot give a readable coredump (the kernel uses + * ELF_EXEC_PAGESIZE which is not easily accessible, but is usually the same as PAGE_SIZE. */ + return log_info_errno(SYNTHETIC_ERRNO(EBADSLT), + "Resource limits disable core dumping for process %s (%s).", + context[CONTEXT_PID], context[CONTEXT_COMM]); + } + + process_limit = MAX(arg_process_size_max, storage_size_max()); + if (process_limit == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT), + "Limits for coredump processing and storage are both 0, not dumping core."); + + /* Never store more than the process configured, or than we actually shall keep or process */ + max_size = MIN(rlimit, process_limit); + + r = make_filename(context, &fn); + if (r < 0) + return log_error_errno(r, "Failed to determine coredump file name: %m"); + + mkdir_p_label("/var/lib/systemd/coredump", 0755); + + fd = open_tmpfile_linkable(fn, O_RDWR|O_CLOEXEC, &tmp); + if (fd < 0) + return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn); + + r = copy_bytes(input_fd, fd, max_size, 0); + if (r < 0) { + log_error_errno(r, "Cannot store coredump of %s (%s): %m", context[CONTEXT_PID], context[CONTEXT_COMM]); + goto fail; + } + *ret_truncated = r == 1; + if (*ret_truncated) + log_struct(LOG_INFO, + LOG_MESSAGE("Core file was truncated to %zu bytes.", max_size), + "SIZE_LIMIT=%zu", max_size, + "MESSAGE_ID=" SD_MESSAGE_TRUNCATED_CORE_STR); + + if (fstat(fd, &st) < 0) { + log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp)); + goto fail; + } + + if (lseek(fd, 0, SEEK_SET) == (off_t) -1) { + log_error_errno(errno, "Failed to seek on %s: %m", coredump_tmpfile_name(tmp)); + goto fail; + } + +#if HAVE_XZ || HAVE_LZ4 + /* If we will remove the coredump anyway, do not compress. */ + if (arg_compress && !maybe_remove_external_coredump(NULL, st.st_size)) { + + _cleanup_free_ char *fn_compressed = NULL, *tmp_compressed = NULL; + _cleanup_close_ int fd_compressed = -1; + + fn_compressed = strappend(fn, COMPRESSED_EXT); + if (!fn_compressed) { + log_oom(); + goto uncompressed; + } + + fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed); + if (fd_compressed < 0) { + log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed); + goto uncompressed; + } + + r = compress_stream(fd, fd_compressed, -1); + if (r < 0) { + log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed)); + goto fail_compressed; + } + + r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, uid); + if (r < 0) + goto fail_compressed; + + /* OK, this worked, we can get rid of the uncompressed version now */ + if (tmp) + unlink_noerrno(tmp); + + *ret_filename = TAKE_PTR(fn_compressed); /* compressed */ + *ret_node_fd = TAKE_FD(fd_compressed); /* compressed */ + *ret_data_fd = TAKE_FD(fd); /* uncompressed */ + *ret_size = (uint64_t) st.st_size; /* uncompressed */ + + return 0; + + fail_compressed: + if (tmp_compressed) + (void) unlink(tmp_compressed); + } + +uncompressed: +#endif + + r = fix_permissions(fd, tmp, fn, context, uid); + if (r < 0) + goto fail; + + *ret_filename = TAKE_PTR(fn); + *ret_data_fd = TAKE_FD(fd); + *ret_node_fd = -1; + *ret_size = (uint64_t) st.st_size; + + return 0; + +fail: + if (tmp) + (void) unlink(tmp); + return r; +} + +static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) { + _cleanup_free_ char *field = NULL; + ssize_t n; + + assert(fd >= 0); + assert(ret); + assert(ret_size); + + if (lseek(fd, 0, SEEK_SET) == (off_t) -1) + return log_warning_errno(errno, "Failed to seek: %m"); + + field = malloc(9 + size); + if (!field) { + log_warning("Failed to allocate memory for coredump, coredump will not be stored."); + return -ENOMEM; + } + + memcpy(field, "COREDUMP=", 9); + + n = read(fd, field + 9, size); + if (n < 0) + return log_error_errno((int) n, "Failed to read core data: %m"); + if ((size_t) n < size) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Core data too short."); + + *ret = TAKE_PTR(field); + *ret_size = size + 9; + + return 0; +} + +/* Joins /proc/[pid]/fd/ and /proc/[pid]/fdinfo/ into the following lines: + * 0:/dev/pts/23 + * pos: 0 + * flags: 0100002 + * + * 1:/dev/pts/23 + * pos: 0 + * flags: 0100002 + * + * 2:/dev/pts/23 + * pos: 0 + * flags: 0100002 + * EOF + */ +static int compose_open_fds(pid_t pid, char **open_fds) { + _cleanup_closedir_ DIR *proc_fd_dir = NULL; + _cleanup_close_ int proc_fdinfo_fd = -1; + _cleanup_free_ char *buffer = NULL; + _cleanup_fclose_ FILE *stream = NULL; + const char *fddelim = "", *path; + struct dirent *dent = NULL; + size_t size = 0; + int r; + + assert(pid >= 0); + assert(open_fds != NULL); + + path = procfs_file_alloca(pid, "fd"); + proc_fd_dir = opendir(path); + if (!proc_fd_dir) + return -errno; + + proc_fdinfo_fd = openat(dirfd(proc_fd_dir), "../fdinfo", O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (proc_fdinfo_fd < 0) + return -errno; + + stream = open_memstream(&buffer, &size); + if (!stream) + return -ENOMEM; + + (void) __fsetlocking(stream, FSETLOCKING_BYCALLER); + + FOREACH_DIRENT(dent, proc_fd_dir, return -errno) { + _cleanup_fclose_ FILE *fdinfo = NULL; + _cleanup_free_ char *fdname = NULL; + int fd; + + r = readlinkat_malloc(dirfd(proc_fd_dir), dent->d_name, &fdname); + if (r < 0) + return r; + + fprintf(stream, "%s%s:%s\n", fddelim, dent->d_name, fdname); + fddelim = "\n"; + + /* Use the directory entry from /proc/[pid]/fd with /proc/[pid]/fdinfo */ + fd = openat(proc_fdinfo_fd, dent->d_name, O_NOFOLLOW|O_CLOEXEC|O_RDONLY); + if (fd < 0) + continue; + + fdinfo = fdopen(fd, "r"); + if (!fdinfo) { + safe_close(fd); + continue; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_line(fdinfo, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + fputs(line, stream); + fputc('\n', stream); + } + } + + errno = 0; + stream = safe_fclose(stream); + + if (errno > 0) + return -errno; + + *open_fds = TAKE_PTR(buffer); + + return 0; +} + +static int get_process_ns(pid_t pid, const char *namespace, ino_t *ns) { + const char *p; + struct stat stbuf; + _cleanup_close_ int proc_ns_dir_fd; + + p = procfs_file_alloca(pid, "ns"); + + proc_ns_dir_fd = open(p, O_DIRECTORY | O_CLOEXEC | O_RDONLY); + if (proc_ns_dir_fd < 0) + return -errno; + + if (fstatat(proc_ns_dir_fd, namespace, &stbuf, /* flags */0) < 0) + return -errno; + + *ns = stbuf.st_ino; + return 0; +} + +static int get_mount_namespace_leader(pid_t pid, pid_t *container_pid) { + pid_t cpid = pid, ppid = 0; + ino_t proc_mntns; + int r = 0; + + r = get_process_ns(pid, "mnt", &proc_mntns); + if (r < 0) + return r; + + for (;;) { + ino_t parent_mntns; + + r = get_process_ppid(cpid, &ppid); + if (r < 0) + return r; + + r = get_process_ns(ppid, "mnt", &parent_mntns); + if (r < 0) + return r; + + if (proc_mntns != parent_mntns) + break; + + if (ppid == 1) + return -ENOENT; + + cpid = ppid; + } + + *container_pid = ppid; + return 0; +} + +/* Returns 1 if the parent was found. + * Returns 0 if there is not a process we can call the pid's + * container parent (the pid's process isn't 'containerized'). + * Returns a negative number on errors. + */ +static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) { + int r = 0; + pid_t container_pid; + const char *proc_root_path; + struct stat root_stat, proc_root_stat; + + /* To compare inodes of / and /proc/[pid]/root */ + if (stat("/", &root_stat) < 0) + return -errno; + + proc_root_path = procfs_file_alloca(pid, "root"); + if (stat(proc_root_path, &proc_root_stat) < 0) + return -errno; + + /* The process uses system root. */ + if (proc_root_stat.st_ino == root_stat.st_ino) { + *cmdline = NULL; + return 0; + } + + r = get_mount_namespace_leader(pid, &container_pid); + if (r < 0) + return r; + + r = get_process_cmdline(container_pid, 0, false, cmdline); + if (r < 0) + return r; + + return 1; +} + +static int change_uid_gid(const char *context[]) { + uid_t uid; + gid_t gid; + int r; + + r = parse_uid(context[CONTEXT_UID], &uid); + if (r < 0) + return r; + + if (uid <= SYSTEM_UID_MAX) { + const char *user = "systemd-coredump"; + + r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); + if (r < 0) { + log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user); + uid = gid = 0; + } + } else { + r = parse_gid(context[CONTEXT_GID], &gid); + if (r < 0) + return r; + } + + return drop_privileges(uid, gid, 0); +} + +static bool is_journald_crash(const char *context[_CONTEXT_MAX]) { + assert(context); + + return streq_ptr(context[CONTEXT_UNIT], SPECIAL_JOURNALD_SERVICE); +} + +static bool is_pid1_crash(const char *context[_CONTEXT_MAX]) { + assert(context); + + return streq_ptr(context[CONTEXT_UNIT], SPECIAL_INIT_SCOPE) || + streq_ptr(context[CONTEXT_PID], "1"); +} + +#define SUBMIT_COREDUMP_FIELDS 4 + +static int submit_coredump( + const char *context[_CONTEXT_MAX], + struct iovec *iovec, + size_t n_iovec_allocated, + size_t n_iovec, + int input_fd) { + + _cleanup_close_ int coredump_fd = -1, coredump_node_fd = -1; + _cleanup_free_ char *core_message = NULL, *filename = NULL, *coredump_data = NULL; + uint64_t coredump_size = UINT64_MAX; + bool truncated = false, journald_crash; + int r; + + assert(context); + assert(iovec); + assert(n_iovec_allocated >= n_iovec + SUBMIT_COREDUMP_FIELDS); + assert(input_fd >= 0); + + journald_crash = is_journald_crash(context); + + /* Vacuum before we write anything again */ + (void) coredump_vacuum(-1, arg_keep_free, arg_max_use); + + /* Always stream the coredump to disk, if that's possible */ + r = save_external_coredump(context, input_fd, + &filename, &coredump_node_fd, &coredump_fd, &coredump_size, &truncated); + if (r < 0) + /* Skip whole core dumping part */ + goto log; + + /* If we don't want to keep the coredump on disk, remove it now, as later on we will lack the privileges for + * it. However, we keep the fd to it, so that we can still process it and log it. */ + r = maybe_remove_external_coredump(filename, coredump_size); + if (r < 0) + return r; + if (r == 0) { + const char *coredump_filename; + + coredump_filename = strjoina("COREDUMP_FILENAME=", filename); + iovec[n_iovec++] = IOVEC_MAKE_STRING(coredump_filename); + } else if (arg_storage == COREDUMP_STORAGE_EXTERNAL) + log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)", + coredump_size, arg_external_size_max); + + /* Vacuum again, but exclude the coredump we just created */ + (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use); + + /* Now, let's drop privileges to become the user who owns the segfaulted process and allocate the coredump + * memory under the user's uid. This also ensures that the credentials journald will see are the ones of the + * coredumping user, thus making sure the user gets access to the core dump. Let's also get rid of all + * capabilities, if we run as root, we won't need them anymore. */ + r = change_uid_gid(context); + if (r < 0) + return log_error_errno(r, "Failed to drop privileges: %m"); + +#if HAVE_ELFUTILS + /* Try to get a strack trace if we can */ + if (coredump_size <= arg_process_size_max) { + _cleanup_free_ char *stacktrace = NULL; + + r = coredump_make_stack_trace(coredump_fd, context[CONTEXT_EXE], &stacktrace); + if (r >= 0) + core_message = strjoin("MESSAGE=Process ", context[CONTEXT_PID], + " (", context[CONTEXT_COMM], ") of user ", + context[CONTEXT_UID], " dumped core.", + journald_crash ? "\nCoredump diverted to " : "", + journald_crash ? filename : "", + "\n\n", stacktrace); + else if (r == -EINVAL) + log_warning("Failed to generate stack trace: %s", dwfl_errmsg(dwfl_errno())); + else + log_warning_errno(r, "Failed to generate stack trace: %m"); + } else + log_debug("Not generating stack trace: core size %"PRIu64" is greater than %"PRIu64" (the configured maximum)", + coredump_size, arg_process_size_max); + + if (!core_message) +#endif +log: + core_message = strjoin("MESSAGE=Process ", context[CONTEXT_PID], + " (", context[CONTEXT_COMM], ") of user ", + context[CONTEXT_UID], " dumped core.", + journald_crash && filename ? "\nCoredump diverted to " : NULL, + journald_crash && filename ? filename : NULL); + if (!core_message) + return log_oom(); + + if (journald_crash) { + /* We cannot log to the journal, so just print the message. + * The target was set previously to something safe. */ + assert(startswith(core_message, "MESSAGE=")); + log_dispatch(LOG_ERR, 0, core_message + strlen("MESSAGE=")); + return 0; + } + + iovec[n_iovec++] = IOVEC_MAKE_STRING(core_message); + + if (truncated) + iovec[n_iovec++] = IOVEC_MAKE_STRING("COREDUMP_TRUNCATED=1"); + + /* Optionally store the entire coredump in the journal */ + if (arg_storage == COREDUMP_STORAGE_JOURNAL) { + if (coredump_size <= arg_journal_size_max) { + size_t sz = 0; + + /* Store the coredump itself in the journal */ + + r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz); + if (r >= 0) + iovec[n_iovec++] = IOVEC_MAKE(coredump_data, sz); + else + log_warning_errno(r, "Failed to attach the core to the journal entry: %m"); + } else + log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)", + coredump_size, arg_journal_size_max); + } + + assert(n_iovec <= n_iovec_allocated); + + r = sd_journal_sendv(iovec, n_iovec); + if (r < 0) + return log_error_errno(r, "Failed to log coredump: %m"); + + return 0; +} + +static void map_context_fields(const struct iovec *iovec, const char* context[]) { + + static const char * const context_field_names[] = { + [CONTEXT_PID] = "COREDUMP_PID=", + [CONTEXT_UID] = "COREDUMP_UID=", + [CONTEXT_GID] = "COREDUMP_GID=", + [CONTEXT_SIGNAL] = "COREDUMP_SIGNAL=", + [CONTEXT_TIMESTAMP] = "COREDUMP_TIMESTAMP=", + [CONTEXT_RLIMIT] = "COREDUMP_RLIMIT=", + [CONTEXT_HOSTNAME] = "COREDUMP_HOSTNAME=", + [CONTEXT_COMM] = "COREDUMP_COMM=", + [CONTEXT_EXE] = "COREDUMP_EXE=", + }; + + unsigned i; + + assert(iovec); + assert(context); + + for (i = 0; i < ELEMENTSOF(context_field_names); i++) { + char *p; + + if (!context_field_names[i]) + continue; + + p = memory_startswith(iovec->iov_base, iovec->iov_len, context_field_names[i]); + if (!p) + continue; + + /* Note that these strings are NUL terminated, because we made sure that a trailing NUL byte is in the + * buffer, though not included in the iov_len count. (see below) */ + context[i] = p; + break; + } +} + +static int process_socket(int fd) { + _cleanup_close_ int coredump_fd = -1; + struct iovec *iovec = NULL; + size_t n_iovec = 0, n_allocated = 0, i, k; + const char *context[_CONTEXT_MAX] = {}; + int r; + + assert(fd >= 0); + + log_setup_service(); + + log_debug("Processing coredump received on stdin..."); + + for (;;) { + union { + struct cmsghdr cmsghdr; + uint8_t buf[CMSG_SPACE(sizeof(int))]; + } control = {}; + struct msghdr mh = { + .msg_control = &control, + .msg_controllen = sizeof(control), + .msg_iovlen = 1, + }; + ssize_t n; + ssize_t l; + + if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec + SUBMIT_COREDUMP_FIELDS)) { + r = log_oom(); + goto finish; + } + + l = next_datagram_size_fd(fd); + if (l < 0) { + r = log_error_errno(l, "Failed to determine datagram size to read: %m"); + goto finish; + } + + assert(l >= 0); + + iovec[n_iovec].iov_len = l; + iovec[n_iovec].iov_base = malloc(l + 1); + if (!iovec[n_iovec].iov_base) { + r = log_oom(); + goto finish; + } + + mh.msg_iov = iovec + n_iovec; + + n = recvmsg(fd, &mh, MSG_CMSG_CLOEXEC); + if (n < 0) { + free(iovec[n_iovec].iov_base); + r = log_error_errno(errno, "Failed to receive datagram: %m"); + goto finish; + } + + if (n == 0) { + struct cmsghdr *cmsg, *found = NULL; + /* The final zero-length datagram carries the file descriptor and tells us that we're done. */ + + free(iovec[n_iovec].iov_base); + + CMSG_FOREACH(cmsg, &mh) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS && + cmsg->cmsg_len == CMSG_LEN(sizeof(int))) { + assert(!found); + found = cmsg; + } + } + + if (!found) { + log_error("Coredump file descriptor missing."); + r = -EBADMSG; + goto finish; + } + + assert(coredump_fd < 0); + coredump_fd = *(int*) CMSG_DATA(found); + break; + } + + /* Add trailing NUL byte, in case these are strings */ + ((char*) iovec[n_iovec].iov_base)[n] = 0; + iovec[n_iovec].iov_len = (size_t) n; + + cmsg_close_all(&mh); + map_context_fields(iovec + n_iovec, context); + n_iovec++; + } + + if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec + SUBMIT_COREDUMP_FIELDS)) { + r = log_oom(); + goto finish; + } + + /* Make sure we got all data we really need */ + assert(context[CONTEXT_PID]); + assert(context[CONTEXT_UID]); + assert(context[CONTEXT_GID]); + assert(context[CONTEXT_SIGNAL]); + assert(context[CONTEXT_TIMESTAMP]); + assert(context[CONTEXT_RLIMIT]); + assert(context[CONTEXT_HOSTNAME]); + assert(context[CONTEXT_COMM]); + assert(coredump_fd >= 0); + + /* Small quirk: the journal fields contain the timestamp padded with six zeroes, so that the kernel-supplied 1s + * granularity timestamps becomes 1µs granularity, i.e. the granularity systemd usually operates in. Since we + * are reconstructing the original kernel context, we chop this off again, here. */ + k = strlen(context[CONTEXT_TIMESTAMP]); + if (k > 6) + context[CONTEXT_TIMESTAMP] = strndupa(context[CONTEXT_TIMESTAMP], k - 6); + + r = submit_coredump(context, iovec, n_allocated, n_iovec, coredump_fd); + +finish: + for (i = 0; i < n_iovec; i++) + free(iovec[i].iov_base); + free(iovec); + + return r; +} + +static int send_iovec(const struct iovec iovec[], size_t n_iovec, int input_fd) { + + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/coredump", + }; + _cleanup_close_ int fd = -1; + size_t i; + int r; + + assert(iovec || n_iovec <= 0); + assert(input_fd >= 0); + + fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to create coredump socket: %m"); + + if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) + return log_error_errno(errno, "Failed to connect to coredump service: %m"); + + for (i = 0; i < n_iovec; i++) { + struct msghdr mh = { + .msg_iov = (struct iovec*) iovec + i, + .msg_iovlen = 1, + }; + struct iovec copy[2]; + + for (;;) { + if (sendmsg(fd, &mh, MSG_NOSIGNAL) >= 0) + break; + + if (errno == EMSGSIZE && mh.msg_iov[0].iov_len > 0) { + /* This field didn't fit? That's a pity. Given that this is just metadata, + * let's truncate the field at half, and try again. We append three dots, in + * order to show that this is truncated. */ + + if (mh.msg_iov != copy) { + /* We don't want to modify the caller's iovec, hence let's create our + * own array, consisting of two new iovecs, where the first is a + * (truncated) copy of what we want to send, and the second one + * contains the trailing dots. */ + copy[0] = iovec[i]; + copy[1] = (struct iovec) { + .iov_base = (char[]) { '.', '.', '.' }, + .iov_len = 3, + }; + + mh.msg_iov = copy; + mh.msg_iovlen = 2; + } + + copy[0].iov_len /= 2; /* halve it, and try again */ + continue; + } + + return log_error_errno(errno, "Failed to send coredump datagram: %m"); + } + } + + r = send_one_fd(fd, input_fd, 0); + if (r < 0) + return log_error_errno(r, "Failed to send coredump fd: %m"); + + return 0; +} + +static char* set_iovec_field_free(struct iovec *iovec, size_t *n_iovec, const char *field, char *value) { + char *x; + + x = set_iovec_string_field(iovec, n_iovec, field, value); + free(value); + return x; +} + +static int gather_pid_metadata( + char* context[_CONTEXT_MAX], + char **comm_fallback, + struct iovec *iovec, size_t *n_iovec) { + + /* We need 27 empty slots in iovec! + * + * Note that if we fail on oom later on, we do not roll-back changes to the iovec structure. (It remains valid, + * with the first n_iovec fields initialized.) */ + + uid_t owner_uid; + pid_t pid; + char *t; + const char *p; + int r, signo; + + r = parse_pid(context[CONTEXT_PID], &pid); + if (r < 0) + return log_error_errno(r, "Failed to parse PID \"%s\": %m", context[CONTEXT_PID]); + + r = get_process_comm(pid, &context[CONTEXT_COMM]); + if (r < 0) { + log_warning_errno(r, "Failed to get COMM, falling back to the command line: %m"); + context[CONTEXT_COMM] = strv_join(comm_fallback, " "); + if (!context[CONTEXT_COMM]) + return log_oom(); + } + + r = get_process_exe(pid, &context[CONTEXT_EXE]); + if (r < 0) + log_warning_errno(r, "Failed to get EXE, ignoring: %m"); + + if (cg_pid_get_unit(pid, &context[CONTEXT_UNIT]) >= 0) { + if (!is_journald_crash((const char**) context)) { + /* OK, now we know it's not the journal, hence we can make use of it now. */ + log_set_target(LOG_TARGET_JOURNAL_OR_KMSG); + log_open(); + } + + /* If this is PID 1 disable coredump collection, we'll unlikely be able to process it later on. */ + if (is_pid1_crash((const char**) context)) { + log_notice("Due to PID 1 having crashed coredump collection will now be turned off."); + disable_coredumps(); + } + + set_iovec_string_field(iovec, n_iovec, "COREDUMP_UNIT=", context[CONTEXT_UNIT]); + } + + if (cg_pid_get_user_unit(pid, &t) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_USER_UNIT=", t); + + /* The next few are mandatory */ + if (!set_iovec_string_field(iovec, n_iovec, "COREDUMP_PID=", context[CONTEXT_PID])) + return log_oom(); + + if (!set_iovec_string_field(iovec, n_iovec, "COREDUMP_UID=", context[CONTEXT_UID])) + return log_oom(); + + if (!set_iovec_string_field(iovec, n_iovec, "COREDUMP_GID=", context[CONTEXT_GID])) + return log_oom(); + + if (!set_iovec_string_field(iovec, n_iovec, "COREDUMP_SIGNAL=", context[CONTEXT_SIGNAL])) + return log_oom(); + + if (!set_iovec_string_field(iovec, n_iovec, "COREDUMP_RLIMIT=", context[CONTEXT_RLIMIT])) + return log_oom(); + + if (!set_iovec_string_field(iovec, n_iovec, "COREDUMP_HOSTNAME=", context[CONTEXT_HOSTNAME])) + return log_oom(); + + if (!set_iovec_string_field(iovec, n_iovec, "COREDUMP_COMM=", context[CONTEXT_COMM])) + return log_oom(); + + if (context[CONTEXT_EXE] && + !set_iovec_string_field(iovec, n_iovec, "COREDUMP_EXE=", context[CONTEXT_EXE])) + return log_oom(); + + if (sd_pid_get_session(pid, &t) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_SESSION=", t); + + if (sd_pid_get_owner_uid(pid, &owner_uid) >= 0) { + r = asprintf(&t, "COREDUMP_OWNER_UID=" UID_FMT, owner_uid); + if (r > 0) + iovec[(*n_iovec)++] = IOVEC_MAKE_STRING(t); + } + + if (sd_pid_get_slice(pid, &t) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_SLICE=", t); + + if (get_process_cmdline(pid, 0, false, &t) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_CMDLINE=", t); + + if (cg_pid_get_path_shifted(pid, NULL, &t) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_CGROUP=", t); + + if (compose_open_fds(pid, &t) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_OPEN_FDS=", t); + + p = procfs_file_alloca(pid, "status"); + if (read_full_file(p, &t, NULL) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_PROC_STATUS=", t); + + p = procfs_file_alloca(pid, "maps"); + if (read_full_file(p, &t, NULL) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_PROC_MAPS=", t); + + p = procfs_file_alloca(pid, "limits"); + if (read_full_file(p, &t, NULL) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_PROC_LIMITS=", t); + + p = procfs_file_alloca(pid, "cgroup"); + if (read_full_file(p, &t, NULL) >=0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_PROC_CGROUP=", t); + + p = procfs_file_alloca(pid, "mountinfo"); + if (read_full_file(p, &t, NULL) >=0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_PROC_MOUNTINFO=", t); + + if (get_process_cwd(pid, &t) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_CWD=", t); + + if (get_process_root(pid, &t) >= 0) { + bool proc_self_root_is_slash; + + proc_self_root_is_slash = strcmp(t, "/") == 0; + + set_iovec_field_free(iovec, n_iovec, "COREDUMP_ROOT=", t); + + /* If the process' root is "/", then there is a chance it has + * mounted own root and hence being containerized. */ + if (proc_self_root_is_slash && get_process_container_parent_cmdline(pid, &t) > 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_CONTAINER_CMDLINE=", t); + } + + if (get_process_environ(pid, &t) >= 0) + set_iovec_field_free(iovec, n_iovec, "COREDUMP_ENVIRON=", t); + + t = strjoin("COREDUMP_TIMESTAMP=", context[CONTEXT_TIMESTAMP], "000000"); + if (t) + iovec[(*n_iovec)++] = IOVEC_MAKE_STRING(t); + + if (safe_atoi(context[CONTEXT_SIGNAL], &signo) >= 0 && SIGNAL_VALID(signo)) + set_iovec_string_field(iovec, n_iovec, "COREDUMP_SIGNAL_NAME=SIG", signal_to_string(signo)); + + return 0; /* we successfully acquired all metadata */ +} + +static int process_kernel(int argc, char* argv[]) { + + char* context[_CONTEXT_MAX] = {}; + struct iovec iovec[29 + SUBMIT_COREDUMP_FIELDS]; + size_t i, n_iovec, n_to_free = 0; + int r; + + log_debug("Processing coredump received from the kernel..."); + + if (argc < CONTEXT_COMM + 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Not enough arguments passed by the kernel (%i, expected %i).", + argc - 1, CONTEXT_COMM + 1 - 1); + + context[CONTEXT_PID] = argv[1 + CONTEXT_PID]; + context[CONTEXT_UID] = argv[1 + CONTEXT_UID]; + context[CONTEXT_GID] = argv[1 + CONTEXT_GID]; + context[CONTEXT_SIGNAL] = argv[1 + CONTEXT_SIGNAL]; + context[CONTEXT_TIMESTAMP] = argv[1 + CONTEXT_TIMESTAMP]; + context[CONTEXT_RLIMIT] = argv[1 + CONTEXT_RLIMIT]; + context[CONTEXT_HOSTNAME] = argv[1 + CONTEXT_HOSTNAME]; + + r = gather_pid_metadata(context, argv + 1 + CONTEXT_COMM, iovec, &n_to_free); + if (r < 0) + goto finish; + + n_iovec = n_to_free; + + iovec[n_iovec++] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_COREDUMP_STR); + + assert_cc(2 == LOG_CRIT); + iovec[n_iovec++] = IOVEC_MAKE_STRING("PRIORITY=2"); + + assert(n_iovec <= ELEMENTSOF(iovec)); + + if (is_journald_crash((const char**) context) || is_pid1_crash((const char**) context)) + r = submit_coredump((const char**) context, + iovec, ELEMENTSOF(iovec), n_iovec, + STDIN_FILENO); + else + r = send_iovec(iovec, n_iovec, STDIN_FILENO); + + finish: + for (i = 0; i < n_to_free; i++) + free(iovec[i].iov_base); + + /* Those fields are allocated by gather_pid_metadata */ + free(context[CONTEXT_COMM]); + free(context[CONTEXT_EXE]); + free(context[CONTEXT_UNIT]); + + return r; +} + +static int process_backtrace(int argc, char *argv[]) { + char *context[_CONTEXT_MAX] = {}; + _cleanup_free_ char *message = NULL; + _cleanup_free_ struct iovec *iovec = NULL; + size_t n_iovec, n_allocated, n_to_free = 0, i; + int r; + JournalImporter importer = { + .fd = STDIN_FILENO, + }; + + log_debug("Processing backtrace on stdin..."); + + if (argc < CONTEXT_COMM + 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Not enough arguments passed (%i, expected %i).", + argc - 1, CONTEXT_COMM + 1 - 1); + + context[CONTEXT_PID] = argv[2 + CONTEXT_PID]; + context[CONTEXT_UID] = argv[2 + CONTEXT_UID]; + context[CONTEXT_GID] = argv[2 + CONTEXT_GID]; + context[CONTEXT_SIGNAL] = argv[2 + CONTEXT_SIGNAL]; + context[CONTEXT_TIMESTAMP] = argv[2 + CONTEXT_TIMESTAMP]; + context[CONTEXT_RLIMIT] = argv[2 + CONTEXT_RLIMIT]; + context[CONTEXT_HOSTNAME] = argv[2 + CONTEXT_HOSTNAME]; + + n_allocated = 34 + COREDUMP_STORAGE_EXTERNAL; + /* 26 metadata, 2 static, +unknown input, 4 storage, rounded up */ + iovec = new(struct iovec, n_allocated); + if (!iovec) + return log_oom(); + + r = gather_pid_metadata(context, argv + 2 + CONTEXT_COMM, iovec, &n_to_free); + if (r < 0) + goto finish; + if (r > 0) { + /* This was a special crash, and has already been processed. */ + r = 0; + goto finish; + } + n_iovec = n_to_free; + + for (;;) { + r = journal_importer_process_data(&importer); + if (r < 0) { + log_error_errno(r, "Failed to parse journal entry on stdin: %m"); + goto finish; + } + if (r == 1 || /* complete entry */ + journal_importer_eof(&importer)) /* end of data */ + break; + } + + if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec + importer.iovw.count + 2)) + return log_oom(); + + if (journal_importer_eof(&importer)) { + log_warning("Did not receive a full journal entry on stdin, ignoring message sent by reporter"); + + message = strjoin("MESSAGE=Process ", context[CONTEXT_PID], + " (", context[CONTEXT_COMM], ")" + " of user ", context[CONTEXT_UID], + " failed with ", context[CONTEXT_SIGNAL]); + if (!message) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(message); + } else { + for (i = 0; i < importer.iovw.count; i++) + iovec[n_iovec++] = importer.iovw.iovec[i]; + } + + iovec[n_iovec++] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_BACKTRACE_STR); + assert_cc(2 == LOG_CRIT); + iovec[n_iovec++] = IOVEC_MAKE_STRING("PRIORITY=2"); + + assert(n_iovec <= n_allocated); + + r = sd_journal_sendv(iovec, n_iovec); + if (r < 0) + log_error_errno(r, "Failed to log backtrace: %m"); + + finish: + for (i = 0; i < n_to_free; i++) + free(iovec[i].iov_base); + + /* Those fields are allocated by gather_pid_metadata */ + free(context[CONTEXT_COMM]); + free(context[CONTEXT_EXE]); + free(context[CONTEXT_UNIT]); + + return r; +} + +static int run(int argc, char *argv[]) { + int r; + + /* First, log to a safe place, since we don't know what crashed and it might + * be journald which we'd rather not log to then. */ + + log_set_target(LOG_TARGET_KMSG); + log_open(); + + /* Make sure we never enter a loop */ + (void) prctl(PR_SET_DUMPABLE, 0); + + /* Ignore all parse errors */ + (void) parse_config(); + + log_debug("Selected storage '%s'.", coredump_storage_to_string(arg_storage)); + log_debug("Selected compression %s.", yes_no(arg_compress)); + + r = sd_listen_fds(false); + if (r < 0) + return log_error_errno(r, "Failed to determine the number of file descriptors: %m"); + + /* If we got an fd passed, we are running in coredumpd mode. Otherwise we + * are invoked from the kernel as coredump handler. */ + if (r == 0) { + if (streq_ptr(argv[1], "--backtrace")) + return process_backtrace(argc, argv); + else + return process_kernel(argc, argv); + } else if (r == 1) + return process_socket(SD_LISTEN_FDS_START); + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Received unexpected number of file descriptors."); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/coredump/coredump.conf b/src/coredump/coredump.conf new file mode 100644 index 0000000..c2f0643 --- /dev/null +++ b/src/coredump/coredump.conf @@ -0,0 +1,21 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# Entries in this file show the compile time defaults. +# You can change settings by editing this file. +# Defaults can be restored by simply deleting this file. +# +# See coredump.conf(5) for details. + +[Coredump] +#Storage=external +#Compress=yes +#ProcessSizeMax=2G +#ExternalSizeMax=2G +#JournalSizeMax=767M +#MaxUse= +#KeepFree= diff --git a/src/coredump/coredumpctl.c b/src/coredump/coredumpctl.c new file mode 100644 index 0000000..fbee242 --- /dev/null +++ b/src/coredump/coredumpctl.c @@ -0,0 +1,1096 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <fcntl.h> +#include <getopt.h> +#include <locale.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include "sd-bus.h" +#include "sd-journal.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "compress.h" +#include "def.h" +#include "fd-util.h" +#include "fs-util.h" +#include "journal-internal.h" +#include "journal-util.h" +#include "log.h" +#include "macro.h" +#include "main-func.h" +#include "pager.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "sigbus.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "user-util.h" +#include "util.h" +#include "verbs.h" + +#define SHORT_BUS_CALL_TIMEOUT_USEC (3 * USEC_PER_SEC) + +static usec_t arg_since = USEC_INFINITY, arg_until = USEC_INFINITY; +static const char* arg_field = NULL; +static const char *arg_debugger = NULL; +static const char *arg_directory = NULL; +static PagerFlags arg_pager_flags = 0; +static int arg_no_legend = false; +static int arg_one = false; +static const char* arg_output = NULL; +static bool arg_reverse = false; +static bool arg_quiet = false; + +static int add_match(sd_journal *j, const char *match) { + _cleanup_free_ char *p = NULL; + const char* prefix, *pattern; + pid_t pid; + int r; + + if (strchr(match, '=')) + prefix = ""; + else if (strchr(match, '/')) { + r = path_make_absolute_cwd(match, &p); + if (r < 0) + return log_error_errno(r, "path_make_absolute_cwd(\"%s\"): %m", match); + + match = p; + prefix = "COREDUMP_EXE="; + } else if (parse_pid(match, &pid) >= 0) + prefix = "COREDUMP_PID="; + else + prefix = "COREDUMP_COMM="; + + pattern = strjoina(prefix, match); + log_debug("Adding match: %s", pattern); + r = sd_journal_add_match(j, pattern, 0); + if (r < 0) + return log_error_errno(r, "Failed to add match \"%s\": %m", match); + + return 0; +} + +static int add_matches(sd_journal *j, char **matches) { + char **match; + int r; + + r = sd_journal_add_match(j, "MESSAGE_ID=" SD_MESSAGE_COREDUMP_STR, 0); + if (r < 0) + return log_error_errno(r, "Failed to add match \"%s\": %m", "MESSAGE_ID=" SD_MESSAGE_COREDUMP_STR); + + r = sd_journal_add_match(j, "MESSAGE_ID=" SD_MESSAGE_BACKTRACE_STR, 0); + if (r < 0) + return log_error_errno(r, "Failed to add match \"%s\": %m", "MESSAGE_ID=" SD_MESSAGE_BACKTRACE_STR); + + STRV_FOREACH(match, matches) { + r = add_match(j, *match); + if (r < 0) + return r; + } + + return 0; +} + +static int acquire_journal(sd_journal **ret, char **matches) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + int r; + + assert(ret); + + if (arg_directory) { + r = sd_journal_open_directory(&j, arg_directory, 0); + if (r < 0) + return log_error_errno(r, "Failed to open journals in directory: %s: %m", arg_directory); + } else { + r = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); + if (r < 0) + return log_error_errno(r, "Failed to open journal: %m"); + } + + r = journal_access_check_and_warn(j, arg_quiet, true); + if (r < 0) + return r; + + r = add_matches(j, matches); + if (r < 0) + return r; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *filter; + + filter = journal_make_match_string(j); + log_debug("Journal filter: %s", filter); + } + + *ret = TAKE_PTR(j); + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("coredumpctl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "List or retrieve coredumps from the journal.\n\n" + "Flags:\n" + " -h --help Show this help\n" + " --version Print version string\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not print the column headers\n" + " --debugger=DEBUGGER Use the given debugger\n" + " -1 Show information about most recent entry only\n" + " -S --since=DATE Only print coredumps since the date\n" + " -U --until=DATE Only print coredumps until the date\n" + " -r --reverse Show the newest entries first\n" + " -F --field=FIELD List all values a certain field takes\n" + " -o --output=FILE Write output to FILE\n" + " -D --directory=DIR Use journal files from directory\n\n" + " -q --quiet Do not show info messages and privilege warning\n" + "Commands:\n" + " list [MATCHES...] List available coredumps (default)\n" + " info [MATCHES...] Show detailed information about one or more coredumps\n" + " dump [MATCHES...] Print first matching coredump to stdout\n" + " debug [MATCHES...] Start a debugger for the first matching coredump\n" + "\nSee the %s for details.\n" + , program_invocation_short_name + , link + ); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_LEGEND, + ARG_DEBUGGER, + }; + + int c, r; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version" , no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + { "debugger", required_argument, NULL, ARG_DEBUGGER }, + { "output", required_argument, NULL, 'o' }, + { "field", required_argument, NULL, 'F' }, + { "directory", required_argument, NULL, 'D' }, + { "reverse", no_argument, NULL, 'r' }, + { "since", required_argument, NULL, 'S' }, + { "until", required_argument, NULL, 'U' }, + { "quiet", no_argument, NULL, 'q' }, + {} + }; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "ho:F:1D:rS:U:q", options, NULL)) >= 0) + switch(c) { + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_no_legend = true; + break; + + case ARG_DEBUGGER: + arg_debugger = optarg; + break; + + case 'o': + if (arg_output) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot set output more than once."); + + arg_output = optarg; + break; + + case 'S': + r = parse_timestamp(optarg, &arg_since); + if (r < 0) + return log_error_errno(r, "Failed to parse timestamp '%s': %m", optarg); + break; + + case 'U': + r = parse_timestamp(optarg, &arg_until); + if (r < 0) + return log_error_errno(r, "Failed to parse timestamp '%s': %m", optarg); + break; + + case 'F': + if (arg_field) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use --field/-F more than once."); + arg_field = optarg; + break; + + case '1': + arg_one = true; + break; + + case 'D': + arg_directory = optarg; + break; + + case 'r': + arg_reverse = true; + break; + + case 'q': + arg_quiet = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached("Unhandled option"); + } + + if (arg_since != USEC_INFINITY && arg_until != USEC_INFINITY && + arg_since > arg_until) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--since= must be before --until=."); + + return 1; +} + +static int retrieve(const void *data, + size_t len, + const char *name, + char **var) { + + size_t ident; + char *v; + + ident = strlen(name) + 1; /* name + "=" */ + + if (len < ident) + return 0; + + if (memcmp(data, name, ident - 1) != 0) + return 0; + + if (((const char*) data)[ident - 1] != '=') + return 0; + + v = strndup((const char*)data + ident, len - ident); + if (!v) + return log_oom(); + + free(*var); + *var = v; + + return 1; +} + +static int print_field(FILE* file, sd_journal *j) { + const void *d; + size_t l; + + assert(file); + assert(j); + + assert(arg_field); + + /* A (user-specified) field may appear more than once for a given entry. + * We will print all of the occurences. + * This is different below for fields that systemd-coredump uses, + * because they cannot meaningfully appear more than once. + */ + SD_JOURNAL_FOREACH_DATA(j, d, l) { + _cleanup_free_ char *value = NULL; + int r; + + r = retrieve(d, l, arg_field, &value); + if (r < 0) + return r; + if (r > 0) + fprintf(file, "%s\n", value); + } + + return 0; +} + +#define RETRIEVE(d, l, name, arg) \ + { \ + int _r = retrieve(d, l, name, &arg); \ + if (_r < 0) \ + return _r; \ + if (_r > 0) \ + continue; \ + } + +static int print_list(FILE* file, sd_journal *j, int had_legend) { + _cleanup_free_ char + *mid = NULL, *pid = NULL, *uid = NULL, *gid = NULL, + *sgnl = NULL, *exe = NULL, *comm = NULL, *cmdline = NULL, + *filename = NULL, *truncated = NULL, *coredump = NULL; + const void *d; + size_t l; + usec_t t; + char buf[FORMAT_TIMESTAMP_MAX]; + int r; + const char *present; + bool normal_coredump; + + assert(file); + assert(j); + + SD_JOURNAL_FOREACH_DATA(j, d, l) { + RETRIEVE(d, l, "MESSAGE_ID", mid); + RETRIEVE(d, l, "COREDUMP_PID", pid); + RETRIEVE(d, l, "COREDUMP_UID", uid); + RETRIEVE(d, l, "COREDUMP_GID", gid); + RETRIEVE(d, l, "COREDUMP_SIGNAL", sgnl); + RETRIEVE(d, l, "COREDUMP_EXE", exe); + RETRIEVE(d, l, "COREDUMP_COMM", comm); + RETRIEVE(d, l, "COREDUMP_CMDLINE", cmdline); + RETRIEVE(d, l, "COREDUMP_FILENAME", filename); + RETRIEVE(d, l, "COREDUMP_TRUNCATED", truncated); + RETRIEVE(d, l, "COREDUMP", coredump); + } + + if (!pid && !uid && !gid && !sgnl && !exe && !comm && !cmdline && !filename) { + log_warning("Empty coredump log entry"); + return -EINVAL; + } + + r = sd_journal_get_realtime_usec(j, &t); + if (r < 0) + return log_error_errno(r, "Failed to get realtime timestamp: %m"); + + format_timestamp(buf, sizeof(buf), t); + + if (!had_legend && !arg_no_legend) + fprintf(file, "%-*s %*s %*s %*s %*s %-*s %s\n", + FORMAT_TIMESTAMP_WIDTH, "TIME", + 6, "PID", + 5, "UID", + 5, "GID", + 3, "SIG", + 9, "COREFILE", + "EXE"); + + normal_coredump = streq_ptr(mid, SD_MESSAGE_COREDUMP_STR); + + if (filename) + if (access(filename, R_OK) == 0) + present = "present"; + else if (errno == ENOENT) + present = "missing"; + else + present = "error"; + else if (coredump) + present = "journal"; + else if (normal_coredump) + present = "none"; + else + present = "-"; + + if (STR_IN_SET(present, "present", "journal") && truncated && parse_boolean(truncated) > 0) + present = "truncated"; + + fprintf(file, "%-*s %*s %*s %*s %*s %-*s %s\n", + FORMAT_TIMESTAMP_WIDTH, buf, + 6, strna(pid), + 5, strna(uid), + 5, strna(gid), + 3, normal_coredump ? strna(sgnl) : "-", + 9, present, + strna(exe ?: (comm ?: cmdline))); + + return 0; +} + +static int print_info(FILE *file, sd_journal *j, bool need_space) { + _cleanup_free_ char + *mid = NULL, *pid = NULL, *uid = NULL, *gid = NULL, + *sgnl = NULL, *exe = NULL, *comm = NULL, *cmdline = NULL, + *unit = NULL, *user_unit = NULL, *session = NULL, + *boot_id = NULL, *machine_id = NULL, *hostname = NULL, + *slice = NULL, *cgroup = NULL, *owner_uid = NULL, + *message = NULL, *timestamp = NULL, *filename = NULL, + *truncated = NULL, *coredump = NULL; + const void *d; + size_t l; + bool normal_coredump; + int r; + + assert(file); + assert(j); + + SD_JOURNAL_FOREACH_DATA(j, d, l) { + RETRIEVE(d, l, "MESSAGE_ID", mid); + RETRIEVE(d, l, "COREDUMP_PID", pid); + RETRIEVE(d, l, "COREDUMP_UID", uid); + RETRIEVE(d, l, "COREDUMP_GID", gid); + RETRIEVE(d, l, "COREDUMP_SIGNAL", sgnl); + RETRIEVE(d, l, "COREDUMP_EXE", exe); + RETRIEVE(d, l, "COREDUMP_COMM", comm); + RETRIEVE(d, l, "COREDUMP_CMDLINE", cmdline); + RETRIEVE(d, l, "COREDUMP_UNIT", unit); + RETRIEVE(d, l, "COREDUMP_USER_UNIT", user_unit); + RETRIEVE(d, l, "COREDUMP_SESSION", session); + RETRIEVE(d, l, "COREDUMP_OWNER_UID", owner_uid); + RETRIEVE(d, l, "COREDUMP_SLICE", slice); + RETRIEVE(d, l, "COREDUMP_CGROUP", cgroup); + RETRIEVE(d, l, "COREDUMP_TIMESTAMP", timestamp); + RETRIEVE(d, l, "COREDUMP_FILENAME", filename); + RETRIEVE(d, l, "COREDUMP_TRUNCATED", truncated); + RETRIEVE(d, l, "COREDUMP", coredump); + RETRIEVE(d, l, "_BOOT_ID", boot_id); + RETRIEVE(d, l, "_MACHINE_ID", machine_id); + RETRIEVE(d, l, "_HOSTNAME", hostname); + RETRIEVE(d, l, "MESSAGE", message); + } + + if (need_space) + fputs("\n", file); + + normal_coredump = streq_ptr(mid, SD_MESSAGE_COREDUMP_STR); + + if (comm) + fprintf(file, + " PID: %s%s%s (%s)\n", + ansi_highlight(), strna(pid), ansi_normal(), comm); + else + fprintf(file, + " PID: %s%s%s\n", + ansi_highlight(), strna(pid), ansi_normal()); + + if (uid) { + uid_t n; + + if (parse_uid(uid, &n) >= 0) { + _cleanup_free_ char *u = NULL; + + u = uid_to_name(n); + fprintf(file, + " UID: %s (%s)\n", + uid, u); + } else { + fprintf(file, + " UID: %s\n", + uid); + } + } + + if (gid) { + gid_t n; + + if (parse_gid(gid, &n) >= 0) { + _cleanup_free_ char *g = NULL; + + g = gid_to_name(n); + fprintf(file, + " GID: %s (%s)\n", + gid, g); + } else { + fprintf(file, + " GID: %s\n", + gid); + } + } + + if (sgnl) { + int sig; + const char *name = normal_coredump ? "Signal" : "Reason"; + + if (normal_coredump && safe_atoi(sgnl, &sig) >= 0) + fprintf(file, " %s: %s (%s)\n", name, sgnl, signal_to_string(sig)); + else + fprintf(file, " %s: %s\n", name, sgnl); + } + + if (timestamp) { + usec_t u; + + r = safe_atou64(timestamp, &u); + if (r >= 0) { + char absolute[FORMAT_TIMESTAMP_MAX], relative[FORMAT_TIMESPAN_MAX]; + + fprintf(file, + " Timestamp: %s (%s)\n", + format_timestamp(absolute, sizeof(absolute), u), + format_timestamp_relative(relative, sizeof(relative), u)); + + } else + fprintf(file, " Timestamp: %s\n", timestamp); + } + + if (cmdline) + fprintf(file, " Command Line: %s\n", cmdline); + if (exe) + fprintf(file, " Executable: %s%s%s\n", ansi_highlight(), exe, ansi_normal()); + if (cgroup) + fprintf(file, " Control Group: %s\n", cgroup); + if (unit) + fprintf(file, " Unit: %s\n", unit); + if (user_unit) + fprintf(file, " User Unit: %s\n", user_unit); + if (slice) + fprintf(file, " Slice: %s\n", slice); + if (session) + fprintf(file, " Session: %s\n", session); + if (owner_uid) { + uid_t n; + + if (parse_uid(owner_uid, &n) >= 0) { + _cleanup_free_ char *u = NULL; + + u = uid_to_name(n); + fprintf(file, + " Owner UID: %s (%s)\n", + owner_uid, u); + } else { + fprintf(file, + " Owner UID: %s\n", + owner_uid); + } + } + if (boot_id) + fprintf(file, " Boot ID: %s\n", boot_id); + if (machine_id) + fprintf(file, " Machine ID: %s\n", machine_id); + if (hostname) + fprintf(file, " Hostname: %s\n", hostname); + + if (filename) { + bool inacc, trunc; + + inacc = access(filename, R_OK) < 0; + trunc = truncated && parse_boolean(truncated) > 0; + + if (inacc || trunc) + fprintf(file, " Storage: %s%s (%s%s%s)%s\n", + ansi_highlight_red(), + filename, + inacc ? "inaccessible" : "", + inacc && trunc ? ", " : "", + trunc ? "truncated" : "", + ansi_normal()); + else + fprintf(file, " Storage: %s\n", filename); + } + + else if (coredump) + fprintf(file, " Storage: journal\n"); + else + fprintf(file, " Storage: none\n"); + + if (message) { + _cleanup_free_ char *m = NULL; + + m = strreplace(message, "\n", "\n "); + + fprintf(file, " Message: %s\n", strstrip(m ?: message)); + } + + return 0; +} + +static int focus(sd_journal *j) { + int r; + + r = sd_journal_seek_tail(j); + if (r == 0) + r = sd_journal_previous(j); + if (r < 0) + return log_error_errno(r, "Failed to search journal: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), + "No match found."); + return r; +} + +static int print_entry(sd_journal *j, unsigned n_found, bool verb_is_info) { + assert(j); + + if (verb_is_info) + return print_info(stdout, j, n_found); + else if (arg_field) + return print_field(stdout, j); + else + return print_list(stdout, j, n_found); +} + +static int dump_list(int argc, char **argv, void *userdata) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + unsigned n_found = 0; + bool verb_is_info; + int r; + + verb_is_info = (argc >= 1 && streq(argv[0], "info")); + + r = acquire_journal(&j, argv + 1); + if (r < 0) + return r; + + (void) pager_open(arg_pager_flags); + + /* The coredumps are likely to compressed, and for just + * listing them we don't need to decompress them, so let's + * pick a fairly low data threshold here */ + sd_journal_set_data_threshold(j, 4096); + + /* "info" without pattern implies "-1" */ + if (arg_one || (verb_is_info && argc == 1)) { + r = focus(j); + if (r < 0) + return r; + + return print_entry(j, 0, verb_is_info); + } else { + if (arg_since != USEC_INFINITY && !arg_reverse) + r = sd_journal_seek_realtime_usec(j, arg_since); + else if (arg_until != USEC_INFINITY && arg_reverse) + r = sd_journal_seek_realtime_usec(j, arg_until); + else if (arg_reverse) + r = sd_journal_seek_tail(j); + else + r = sd_journal_seek_head(j); + if (r < 0) + return log_error_errno(r, "Failed to seek to date: %m"); + + for (;;) { + if (!arg_reverse) + r = sd_journal_next(j); + else + r = sd_journal_previous(j); + + if (r < 0) + return log_error_errno(r, "Failed to iterate through journal: %m"); + + if (r == 0) + break; + + if (arg_until != USEC_INFINITY && !arg_reverse) { + usec_t usec; + + r = sd_journal_get_realtime_usec(j, &usec); + if (r < 0) + return log_error_errno(r, "Failed to determine timestamp: %m"); + if (usec > arg_until) + continue; + } + + if (arg_since != USEC_INFINITY && arg_reverse) { + usec_t usec; + + r = sd_journal_get_realtime_usec(j, &usec); + if (r < 0) + return log_error_errno(r, "Failed to determine timestamp: %m"); + if (usec < arg_since) + continue; + } + + r = print_entry(j, n_found++, verb_is_info); + if (r < 0) + return r; + } + + if (!arg_field && n_found <= 0) { + if (!arg_quiet) + log_notice("No coredumps found."); + return -ESRCH; + } + } + + return 0; +} + +static int save_core(sd_journal *j, FILE *file, char **path, bool *unlink_temp) { + const char *data; + _cleanup_free_ char *filename = NULL; + size_t len; + int r, fd; + _cleanup_close_ int fdt = -1; + char *temp = NULL; + + assert(!(file && path)); /* At most one can be specified */ + assert(!!path == !!unlink_temp); /* Those must be specified together */ + + /* Look for a coredump on disk first. */ + r = sd_journal_get_data(j, "COREDUMP_FILENAME", (const void**) &data, &len); + if (r == 0) + retrieve(data, len, "COREDUMP_FILENAME", &filename); + else { + if (r != -ENOENT) + return log_error_errno(r, "Failed to retrieve COREDUMP_FILENAME field: %m"); + /* Check that we can have a COREDUMP field. We still haven't set a high + * data threshold, so we'll get a few kilobytes at most. + */ + + r = sd_journal_get_data(j, "COREDUMP", (const void**) &data, &len); + if (r == -ENOENT) + return log_error_errno(r, "Coredump entry has no core attached (neither internally in the journal nor externally on disk)."); + if (r < 0) + return log_error_errno(r, "Failed to retrieve COREDUMP field: %m"); + } + + if (filename) { + if (access(filename, R_OK) < 0) + return log_error_errno(errno, "File \"%s\" is not readable: %m", filename); + + if (path && !endswith(filename, ".xz") && !endswith(filename, ".lz4")) { + *path = TAKE_PTR(filename); + + return 0; + } + } + + if (path) { + const char *vt; + + /* Create a temporary file to write the uncompressed core to. */ + + r = var_tmp_dir(&vt); + if (r < 0) + return log_error_errno(r, "Failed to acquire temporary directory path: %m"); + + temp = strjoin(vt, "/coredump-XXXXXX"); + if (!temp) + return log_oom(); + + fdt = mkostemp_safe(temp); + if (fdt < 0) + return log_error_errno(fdt, "Failed to create temporary file: %m"); + log_debug("Created temporary file %s", temp); + + fd = fdt; + } else { + /* If neither path or file are specified, we will write to stdout. Let's now check + * if stdout is connected to a tty. We checked that the file exists, or that the + * core might be stored in the journal. In this second case, if we found the entry, + * in all likelyhood we will be able to access the COREDUMP= field. In either case, + * we stop before doing any "real" work, i.e. before starting decompression or + * reading from the file or creating temporary files. + */ + if (!file) { + if (on_tty()) + return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), + "Refusing to dump core to tty" + " (use shell redirection or specify --output)."); + file = stdout; + } + + fd = fileno(file); + } + + if (filename) { +#if HAVE_XZ || HAVE_LZ4 + _cleanup_close_ int fdf; + + fdf = open(filename, O_RDONLY | O_CLOEXEC); + if (fdf < 0) { + r = log_error_errno(errno, "Failed to open %s: %m", filename); + goto error; + } + + r = decompress_stream(filename, fdf, fd, -1); + if (r < 0) { + log_error_errno(r, "Failed to decompress %s: %m", filename); + goto error; + } +#else + log_error("Cannot decompress file. Compiled without compression support."); + r = -EOPNOTSUPP; + goto error; +#endif + } else { + ssize_t sz; + + /* We want full data, nothing truncated. */ + sd_journal_set_data_threshold(j, 0); + + r = sd_journal_get_data(j, "COREDUMP", (const void**) &data, &len); + if (r < 0) + return log_error_errno(r, "Failed to retrieve COREDUMP field: %m"); + + assert(len >= 9); + data += 9; + len -= 9; + + sz = write(fd, data, len); + if (sz < 0) { + r = log_error_errno(errno, "Failed to write output: %m"); + goto error; + } + if (sz != (ssize_t) len) { + log_error("Short write to output."); + r = -EIO; + goto error; + } + } + + if (temp) { + *path = temp; + *unlink_temp = true; + } + return 0; + +error: + if (temp) { + unlink(temp); + log_debug("Removed temporary file %s", temp); + } + return r; +} + +static int dump_core(int argc, char **argv, void *userdata) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + if (arg_field) { + log_error("Option --field/-F only makes sense with list"); + return -EINVAL; + } + + r = acquire_journal(&j, argv + 1); + if (r < 0) + return r; + + r = focus(j); + if (r < 0) + return r; + + if (arg_output) { + f = fopen(arg_output, "we"); + if (!f) + return log_error_errno(errno, "Failed to open \"%s\" for writing: %m", arg_output); + } + + print_info(f ? stdout : stderr, j, false); + + r = save_core(j, f, NULL, NULL); + if (r < 0) + return r; + + r = sd_journal_previous(j); + if (r > 0 && !arg_quiet) + log_notice("More than one entry matches, ignoring rest."); + + return 0; +} + +static int run_debug(int argc, char **argv, void *userdata) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + _cleanup_free_ char *exe = NULL, *path = NULL; + bool unlink_path = false; + const char *data, *fork_name; + size_t len; + pid_t pid; + int r; + + if (!arg_debugger) { + char *env_debugger; + + env_debugger = getenv("SYSTEMD_DEBUGGER"); + if (env_debugger) + arg_debugger = env_debugger; + else + arg_debugger = "gdb"; + } + + if (arg_field) { + log_error("Option --field/-F only makes sense with list"); + return -EINVAL; + } + + r = acquire_journal(&j, argv + 1); + if (r < 0) + return r; + + r = focus(j); + if (r < 0) + return r; + + print_info(stdout, j, false); + fputs("\n", stdout); + + r = sd_journal_get_data(j, "COREDUMP_EXE", (const void**) &data, &len); + if (r < 0) + return log_error_errno(r, "Failed to retrieve COREDUMP_EXE field: %m"); + + assert(len > STRLEN("COREDUMP_EXE=")); + data += STRLEN("COREDUMP_EXE="); + len -= STRLEN("COREDUMP_EXE="); + + exe = strndup(data, len); + if (!exe) + return log_oom(); + + if (endswith(exe, " (deleted)")) { + log_error("Binary already deleted."); + return -ENOENT; + } + + if (!path_is_absolute(exe)) { + log_error("Binary is not an absolute path."); + return -ENOENT; + } + + r = save_core(j, NULL, &path, &unlink_path); + if (r < 0) + return r; + + /* Don't interfere with gdb and its handling of SIGINT. */ + (void) ignore_signals(SIGINT, -1); + + fork_name = strjoina("(", arg_debugger, ")"); + + r = safe_fork(fork_name, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + goto finish; + if (r == 0) { + execlp(arg_debugger, arg_debugger, exe, "-c", path, NULL); + log_open(); + log_error_errno(errno, "Failed to invoke %s: %m", arg_debugger); + _exit(EXIT_FAILURE); + } + + r = wait_for_terminate_and_check(arg_debugger, pid, WAIT_LOG_ABNORMAL); + +finish: + (void) default_signals(SIGINT, -1); + + if (unlink_path) { + log_debug("Removed temporary file %s", path); + unlink(path); + } + + return r; +} + +static int check_units_active(void) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int c = 0, r; + const char *id, *state, *substate; + + if (arg_quiet) + return false; + + r = sd_bus_default_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to acquire bus: %m"); + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "ListUnitsByPatterns"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, NULL); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, STRV_MAKE("systemd-coredump@*.service")); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, SHORT_BUS_CALL_TIMEOUT_USEC, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to check if any systemd-coredump@.service units are running: %s", + bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssssouso)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read( + reply, "(ssssssouso)", + &id, NULL, NULL, &state, &substate, + NULL, NULL, NULL, NULL, NULL)) > 0) { + bool found = !STR_IN_SET(state, "inactive", "dead", "failed"); + log_debug("Unit %s is %s/%s, %scounting it.", id, state, substate, found ? "" : "not "); + c += found; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return c; +} + +static int coredumpctl_main(int argc, char *argv[]) { + + static const Verb verbs[] = { + { "list", VERB_ANY, VERB_ANY, VERB_DEFAULT, dump_list }, + { "info", VERB_ANY, VERB_ANY, 0, dump_list }, + { "dump", VERB_ANY, VERB_ANY, 0, dump_core }, + { "debug", VERB_ANY, VERB_ANY, 0, run_debug }, + { "gdb", VERB_ANY, VERB_ANY, 0, run_debug }, + {} + }; + + return dispatch_verb(argc, argv, verbs, NULL); +} + +static int run(int argc, char *argv[]) { + int r, units_active; + + setlocale(LC_ALL, ""); + log_parse_environment(); + log_open(); + + /* The journal merging logic potentially needs a lot of fds. */ + (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + sigbus_install(); + + units_active = check_units_active(); /* error is treated the same as 0 */ + + r = coredumpctl_main(argc, argv); + + if (units_active > 0) + printf("%s-- Notice: %d systemd-coredump@.service %s, output may be incomplete.%s\n", + ansi_highlight_red(), + units_active, units_active == 1 ? "unit is running" : "units are running", + ansi_normal()); + return r; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/coredump/meson.build b/src/coredump/meson.build new file mode 100644 index 0000000..7fa5942 --- /dev/null +++ b/src/coredump/meson.build @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: LGPL-2.1+ + +systemd_coredump_sources = files(''' + coredump.c + coredump-vacuum.c + coredump-vacuum.h +'''.split()) + +if conf.get('HAVE_ELFUTILS') == 1 + systemd_coredump_sources += files(['stacktrace.c', + 'stacktrace.h']) +endif + +coredumpctl_sources = files('coredumpctl.c') + +if conf.get('ENABLE_COREDUMP') == 1 + install_data('coredump.conf', + install_dir : pkgsysconfdir) +endif + +tests += [ + [['src/coredump/test-coredump-vacuum.c', + 'src/coredump/coredump-vacuum.c', + 'src/coredump/coredump-vacuum.h'], + [], + [], + 'ENABLE_COREDUMP', 'manual'], +] diff --git a/src/coredump/stacktrace.c b/src/coredump/stacktrace.c new file mode 100644 index 0000000..dab4c1a --- /dev/null +++ b/src/coredump/stacktrace.c @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <dwarf.h> +#include <elfutils/libdwfl.h> +#include <stdio_ext.h> + +#include "alloc-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "macro.h" +#include "stacktrace.h" +#include "string-util.h" +#include "util.h" + +#define FRAMES_MAX 64 +#define THREADS_MAX 64 + +struct stack_context { + FILE *f; + Dwfl *dwfl; + Elf *elf; + unsigned n_thread; + unsigned n_frame; +}; + +static int frame_callback(Dwfl_Frame *frame, void *userdata) { + struct stack_context *c = userdata; + Dwarf_Addr pc, pc_adjusted, bias = 0; + _cleanup_free_ Dwarf_Die *scopes = NULL; + const char *fname = NULL, *symbol = NULL; + Dwfl_Module *module; + bool is_activation; + + assert(frame); + assert(c); + + if (c->n_frame >= FRAMES_MAX) + return DWARF_CB_ABORT; + + if (!dwfl_frame_pc(frame, &pc, &is_activation)) + return DWARF_CB_ABORT; + + pc_adjusted = pc - (is_activation ? 0 : 1); + + module = dwfl_addrmodule(c->dwfl, pc_adjusted); + if (module) { + Dwarf_Die *s, *cudie; + int n; + + cudie = dwfl_module_addrdie(module, pc_adjusted, &bias); + if (cudie) { + n = dwarf_getscopes(cudie, pc_adjusted - bias, &scopes); + for (s = scopes; s < scopes + n; s++) { + if (IN_SET(dwarf_tag(s), DW_TAG_subprogram, DW_TAG_inlined_subroutine, DW_TAG_entry_point)) { + Dwarf_Attribute *a, space; + + a = dwarf_attr_integrate(s, DW_AT_MIPS_linkage_name, &space); + if (!a) + a = dwarf_attr_integrate(s, DW_AT_linkage_name, &space); + if (a) + symbol = dwarf_formstring(a); + if (!symbol) + symbol = dwarf_diename(s); + + if (symbol) + break; + } + } + } + + if (!symbol) + symbol = dwfl_module_addrname(module, pc_adjusted); + + fname = dwfl_module_info(module, NULL, NULL, NULL, NULL, NULL, NULL, NULL); + } + + fprintf(c->f, "#%-2u 0x%016" PRIx64 " %s (%s)\n", c->n_frame, (uint64_t) pc, strna(symbol), strna(fname)); + c->n_frame++; + + return DWARF_CB_OK; +} + +static int thread_callback(Dwfl_Thread *thread, void *userdata) { + struct stack_context *c = userdata; + pid_t tid; + + assert(thread); + assert(c); + + if (c->n_thread >= THREADS_MAX) + return DWARF_CB_ABORT; + + if (c->n_thread != 0) + fputc('\n', c->f); + + c->n_frame = 0; + + tid = dwfl_thread_tid(thread); + fprintf(c->f, "Stack trace of thread " PID_FMT ":\n", tid); + + if (dwfl_thread_getframes(thread, frame_callback, c) < 0) + return DWARF_CB_ABORT; + + c->n_thread++; + + return DWARF_CB_OK; +} + +int coredump_make_stack_trace(int fd, const char *executable, char **ret) { + + static const Dwfl_Callbacks callbacks = { + .find_elf = dwfl_build_id_find_elf, + .find_debuginfo = dwfl_standard_find_debuginfo, + }; + + struct stack_context c = {}; + char *buf = NULL; + size_t sz = 0; + int r; + + assert(fd >= 0); + assert(ret); + + if (lseek(fd, 0, SEEK_SET) == (off_t) -1) + return -errno; + + c.f = open_memstream(&buf, &sz); + if (!c.f) + return -ENOMEM; + + (void) __fsetlocking(c.f, FSETLOCKING_BYCALLER); + + elf_version(EV_CURRENT); + + c.elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!c.elf) { + r = -EINVAL; + goto finish; + } + + c.dwfl = dwfl_begin(&callbacks); + if (!c.dwfl) { + r = -EINVAL; + goto finish; + } + + if (dwfl_core_file_report(c.dwfl, c.elf, executable) < 0) { + r = -EINVAL; + goto finish; + } + + if (dwfl_report_end(c.dwfl, NULL, NULL) != 0) { + r = -EINVAL; + goto finish; + } + + if (dwfl_core_file_attach(c.dwfl, c.elf) < 0) { + r = -EINVAL; + goto finish; + } + + if (dwfl_getthreads(c.dwfl, thread_callback, &c) < 0) { + r = -EINVAL; + goto finish; + } + + c.f = safe_fclose(c.f); + + *ret = TAKE_PTR(buf); + + r = 0; + +finish: + if (c.dwfl) + dwfl_end(c.dwfl); + + if (c.elf) + elf_end(c.elf); + + safe_fclose(c.f); + + free(buf); + + return r; +} diff --git a/src/coredump/stacktrace.h b/src/coredump/stacktrace.h new file mode 100644 index 0000000..5290042 --- /dev/null +++ b/src/coredump/stacktrace.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +int coredump_make_stack_trace(int fd, const char *executable, char **ret); diff --git a/src/coredump/test-coredump-vacuum.c b/src/coredump/test-coredump-vacuum.c new file mode 100644 index 0000000..75fb442 --- /dev/null +++ b/src/coredump/test-coredump-vacuum.c @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <stdlib.h> + +#include "coredump-vacuum.h" + +int main(int argc, char *argv[]) { + + if (coredump_vacuum(-1, (uint64_t) -1, 70 * 1024) < 0) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} |