From 55944e5e40b1be2afc4855d8d2baf4b73d1876b5 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 10 Apr 2024 22:49:52 +0200 Subject: Adding upstream version 255.4. Signed-off-by: Daniel Baumann --- src/core/all-units.h | 15 + src/core/apparmor-setup.c | 99 + src/core/apparmor-setup.h | 4 + src/core/audit-fd.c | 62 + src/core/audit-fd.h | 5 + src/core/automount.c | 1149 ++++ src/core/automount.h | 45 + src/core/bpf-devices.c | 505 ++ src/core/bpf-devices.h | 21 + src/core/bpf-firewall.c | 974 +++ src/core/bpf-firewall.h | 25 + src/core/bpf-foreign.c | 154 + src/core/bpf-foreign.h | 15 + src/core/bpf-lsm.c | 320 + src/core/bpf-lsm.h | 28 + src/core/bpf-socket-bind.c | 244 + src/core/bpf-socket-bind.h | 15 + src/core/bpf-util.c | 36 + src/core/bpf-util.h | 5 + src/core/bpf/restrict_fs/meson.build | 24 + src/core/bpf/restrict_fs/restrict-fs-skel.h | 14 + src/core/bpf/restrict_fs/restrict-fs.bpf.c | 82 + src/core/bpf/restrict_ifaces/meson.build | 24 + .../bpf/restrict_ifaces/restrict-ifaces-skel.h | 14 + src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c | 52 + src/core/bpf/socket_bind/meson.build | 24 + src/core/bpf/socket_bind/socket-bind-api.bpf.h | 51 + src/core/bpf/socket_bind/socket-bind-skel.h | 14 + src/core/bpf/socket_bind/socket-bind.bpf.c | 111 + src/core/cgroup.c | 4665 ++++++++++++++ src/core/cgroup.h | 429 ++ src/core/core-varlink.c | 652 ++ src/core/core-varlink.h | 16 + src/core/crash-handler.c | 193 + src/core/crash-handler.h | 7 + src/core/dbus-automount.c | 68 + src/core/dbus-automount.h | 11 + src/core/dbus-cgroup.c | 2287 +++++++ src/core/dbus-cgroup.h | 15 + src/core/dbus-device.c | 11 + src/core/dbus-device.h | 6 + src/core/dbus-execute.c | 3758 +++++++++++ src/core/dbus-execute.h | 35 + src/core/dbus-job.c | 374 ++ src/core/dbus-job.h | 20 + src/core/dbus-kill.c | 81 + src/core/dbus-kill.h | 12 + src/core/dbus-manager.c | 3628 +++++++++++ src/core/dbus-manager.h | 18 + src/core/dbus-mount.c | 174 + src/core/dbus-mount.h | 12 + src/core/dbus-path.c | 164 + src/core/dbus-path.h | 11 + src/core/dbus-scope.c | 318 + src/core/dbus-scope.h | 19 + src/core/dbus-service.c | 791 +++ src/core/dbus-service.h | 15 + src/core/dbus-slice.c | 34 + src/core/dbus-slice.h | 12 + src/core/dbus-socket.c | 470 ++ src/core/dbus-socket.h | 12 + src/core/dbus-swap.c | 55 + src/core/dbus-swap.h | 16 + src/core/dbus-target.c | 9 + src/core/dbus-target.h | 6 + src/core/dbus-timer.c | 364 ++ src/core/dbus-timer.h | 11 + src/core/dbus-unit.c | 2629 ++++++++ src/core/dbus-unit.h | 55 + src/core/dbus-util.c | 286 + src/core/dbus-util.h | 256 + src/core/dbus.c | 1273 ++++ src/core/dbus.h | 37 + src/core/device.c | 1301 ++++ src/core/device.h | 44 + src/core/dynamic-user.c | 871 +++ src/core/dynamic-user.h | 49 + src/core/efi-random.c | 34 + src/core/efi-random.h | 4 + src/core/emergency-action.c | 224 + src/core/emergency-action.h | 45 + src/core/exec-credential.c | 1023 +++ src/core/exec-credential.h | 54 + src/core/exec-invoke.c | 5235 +++++++++++++++ src/core/exec-invoke.h | 16 + src/core/execute-serialize.c | 3896 +++++++++++ src/core/execute-serialize.h | 23 + src/core/execute.c | 2742 ++++++++ src/core/execute.h | 701 ++ src/core/executor.c | 272 + src/core/fuzz-execute-serialize.c | 89 + src/core/fuzz-manager-serialize.c | 36 + src/core/fuzz-manager-serialize.options | 2 + src/core/fuzz-unit-file.c | 86 + src/core/fuzz-unit-file.options | 2 + src/core/generator-setup.c | 58 + src/core/generator-setup.h | 8 + src/core/ima-setup.c | 92 + src/core/ima-setup.h | 9 + src/core/import-creds.c | 938 +++ src/core/import-creds.h | 4 + src/core/job.c | 1712 +++++ src/core/job.h | 250 + src/core/kill.c | 56 + src/core/kill.h | 56 + src/core/kmod-setup.c | 201 + src/core/kmod-setup.h | 4 + src/core/load-dropin.c | 130 + src/core/load-dropin.h | 20 + src/core/load-fragment-gperf-nulstr.awk | 16 + src/core/load-fragment-gperf.gperf.in | 595 ++ src/core/load-fragment.c | 6735 ++++++++++++++++++++ src/core/load-fragment.h | 165 + src/core/main.c | 3227 ++++++++++ src/core/main.h | 9 + src/core/manager-dump.c | 119 + src/core/manager-dump.h | 13 + src/core/manager-serialize.c | 539 ++ src/core/manager-serialize.h | 13 + src/core/manager.c | 5039 +++++++++++++++ src/core/manager.h | 646 ++ src/core/meson.build | 260 + src/core/mount.c | 2502 ++++++++ src/core/mount.h | 110 + src/core/namespace.c | 3047 +++++++++ src/core/namespace.h | 200 + src/core/org.freedesktop.systemd1.conf | 452 ++ src/core/org.freedesktop.systemd1.policy.in | 83 + src/core/org.freedesktop.systemd1.service | 13 + src/core/path.c | 1075 ++++ src/core/path.h | 89 + src/core/restrict-ifaces.c | 200 + src/core/restrict-ifaces.h | 16 + src/core/scope.c | 829 +++ src/core/scope.h | 52 + src/core/selinux-access.c | 288 + src/core/selinux-access.h | 14 + src/core/selinux-setup.c | 106 + src/core/selinux-setup.h | 6 + src/core/service.c | 5161 +++++++++++++++ src/core/service.h | 290 + src/core/show-status.c | 128 + src/core/show-status.h | 44 + src/core/slice.c | 462 ++ src/core/slice.h | 18 + src/core/smack-setup.c | 393 ++ src/core/smack-setup.h | 10 + src/core/socket.c | 3617 +++++++++++ src/core/socket.h | 204 + src/core/swap.c | 1680 +++++ src/core/swap.h | 103 + src/core/system.conf.in | 83 + src/core/systemd.pc.in | 108 + src/core/target.c | 216 + src/core/target.h | 16 + src/core/timer.c | 1106 ++++ src/core/timer.h | 91 + src/core/transaction.c | 1261 ++++ src/core/transaction.h | 51 + src/core/unit-dependency-atom.c | 251 + src/core/unit-dependency-atom.h | 92 + src/core/unit-printf.c | 265 + src/core/unit-printf.h | 26 + src/core/unit-serialize.c | 890 +++ src/core/unit-serialize.h | 16 + src/core/unit.c | 6617 +++++++++++++++++++ src/core/unit.h | 1249 ++++ src/core/user.conf.in | 59 + 168 files changed, 98698 insertions(+) create mode 100644 src/core/all-units.h create mode 100644 src/core/apparmor-setup.c create mode 100644 src/core/apparmor-setup.h create mode 100644 src/core/audit-fd.c create mode 100644 src/core/audit-fd.h create mode 100644 src/core/automount.c create mode 100644 src/core/automount.h create mode 100644 src/core/bpf-devices.c create mode 100644 src/core/bpf-devices.h create mode 100644 src/core/bpf-firewall.c create mode 100644 src/core/bpf-firewall.h create mode 100644 src/core/bpf-foreign.c create mode 100644 src/core/bpf-foreign.h create mode 100644 src/core/bpf-lsm.c create mode 100644 src/core/bpf-lsm.h create mode 100644 src/core/bpf-socket-bind.c create mode 100644 src/core/bpf-socket-bind.h create mode 100644 src/core/bpf-util.c create mode 100644 src/core/bpf-util.h create mode 100644 src/core/bpf/restrict_fs/meson.build create mode 100644 src/core/bpf/restrict_fs/restrict-fs-skel.h create mode 100644 src/core/bpf/restrict_fs/restrict-fs.bpf.c create mode 100644 src/core/bpf/restrict_ifaces/meson.build create mode 100644 src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h create mode 100644 src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c create mode 100644 src/core/bpf/socket_bind/meson.build create mode 100644 src/core/bpf/socket_bind/socket-bind-api.bpf.h create mode 100644 src/core/bpf/socket_bind/socket-bind-skel.h create mode 100644 src/core/bpf/socket_bind/socket-bind.bpf.c create mode 100644 src/core/cgroup.c create mode 100644 src/core/cgroup.h create mode 100644 src/core/core-varlink.c create mode 100644 src/core/core-varlink.h create mode 100644 src/core/crash-handler.c create mode 100644 src/core/crash-handler.h create mode 100644 src/core/dbus-automount.c create mode 100644 src/core/dbus-automount.h create mode 100644 src/core/dbus-cgroup.c create mode 100644 src/core/dbus-cgroup.h create mode 100644 src/core/dbus-device.c create mode 100644 src/core/dbus-device.h create mode 100644 src/core/dbus-execute.c create mode 100644 src/core/dbus-execute.h create mode 100644 src/core/dbus-job.c create mode 100644 src/core/dbus-job.h create mode 100644 src/core/dbus-kill.c create mode 100644 src/core/dbus-kill.h create mode 100644 src/core/dbus-manager.c create mode 100644 src/core/dbus-manager.h create mode 100644 src/core/dbus-mount.c create mode 100644 src/core/dbus-mount.h create mode 100644 src/core/dbus-path.c create mode 100644 src/core/dbus-path.h create mode 100644 src/core/dbus-scope.c create mode 100644 src/core/dbus-scope.h create mode 100644 src/core/dbus-service.c create mode 100644 src/core/dbus-service.h create mode 100644 src/core/dbus-slice.c create mode 100644 src/core/dbus-slice.h create mode 100644 src/core/dbus-socket.c create mode 100644 src/core/dbus-socket.h create mode 100644 src/core/dbus-swap.c create mode 100644 src/core/dbus-swap.h create mode 100644 src/core/dbus-target.c create mode 100644 src/core/dbus-target.h create mode 100644 src/core/dbus-timer.c create mode 100644 src/core/dbus-timer.h create mode 100644 src/core/dbus-unit.c create mode 100644 src/core/dbus-unit.h create mode 100644 src/core/dbus-util.c create mode 100644 src/core/dbus-util.h create mode 100644 src/core/dbus.c create mode 100644 src/core/dbus.h create mode 100644 src/core/device.c create mode 100644 src/core/device.h create mode 100644 src/core/dynamic-user.c create mode 100644 src/core/dynamic-user.h create mode 100644 src/core/efi-random.c create mode 100644 src/core/efi-random.h create mode 100644 src/core/emergency-action.c create mode 100644 src/core/emergency-action.h create mode 100644 src/core/exec-credential.c create mode 100644 src/core/exec-credential.h create mode 100644 src/core/exec-invoke.c create mode 100644 src/core/exec-invoke.h create mode 100644 src/core/execute-serialize.c create mode 100644 src/core/execute-serialize.h create mode 100644 src/core/execute.c create mode 100644 src/core/execute.h create mode 100644 src/core/executor.c create mode 100644 src/core/fuzz-execute-serialize.c create mode 100644 src/core/fuzz-manager-serialize.c create mode 100644 src/core/fuzz-manager-serialize.options create mode 100644 src/core/fuzz-unit-file.c create mode 100644 src/core/fuzz-unit-file.options create mode 100644 src/core/generator-setup.c create mode 100644 src/core/generator-setup.h create mode 100644 src/core/ima-setup.c create mode 100644 src/core/ima-setup.h create mode 100644 src/core/import-creds.c create mode 100644 src/core/import-creds.h create mode 100644 src/core/job.c create mode 100644 src/core/job.h create mode 100644 src/core/kill.c create mode 100644 src/core/kill.h create mode 100644 src/core/kmod-setup.c create mode 100644 src/core/kmod-setup.h create mode 100644 src/core/load-dropin.c create mode 100644 src/core/load-dropin.h create mode 100644 src/core/load-fragment-gperf-nulstr.awk create mode 100644 src/core/load-fragment-gperf.gperf.in create mode 100644 src/core/load-fragment.c create mode 100644 src/core/load-fragment.h create mode 100644 src/core/main.c create mode 100644 src/core/main.h create mode 100644 src/core/manager-dump.c create mode 100644 src/core/manager-dump.h create mode 100644 src/core/manager-serialize.c create mode 100644 src/core/manager-serialize.h create mode 100644 src/core/manager.c create mode 100644 src/core/manager.h create mode 100644 src/core/meson.build create mode 100644 src/core/mount.c create mode 100644 src/core/mount.h create mode 100644 src/core/namespace.c create mode 100644 src/core/namespace.h create mode 100644 src/core/org.freedesktop.systemd1.conf create mode 100644 src/core/org.freedesktop.systemd1.policy.in create mode 100644 src/core/org.freedesktop.systemd1.service create mode 100644 src/core/path.c create mode 100644 src/core/path.h create mode 100644 src/core/restrict-ifaces.c create mode 100644 src/core/restrict-ifaces.h create mode 100644 src/core/scope.c create mode 100644 src/core/scope.h create mode 100644 src/core/selinux-access.c create mode 100644 src/core/selinux-access.h create mode 100644 src/core/selinux-setup.c create mode 100644 src/core/selinux-setup.h create mode 100644 src/core/service.c create mode 100644 src/core/service.h create mode 100644 src/core/show-status.c create mode 100644 src/core/show-status.h create mode 100644 src/core/slice.c create mode 100644 src/core/slice.h create mode 100644 src/core/smack-setup.c create mode 100644 src/core/smack-setup.h create mode 100644 src/core/socket.c create mode 100644 src/core/socket.h create mode 100644 src/core/swap.c create mode 100644 src/core/swap.h create mode 100644 src/core/system.conf.in create mode 100644 src/core/systemd.pc.in create mode 100644 src/core/target.c create mode 100644 src/core/target.h create mode 100644 src/core/timer.c create mode 100644 src/core/timer.h create mode 100644 src/core/transaction.c create mode 100644 src/core/transaction.h create mode 100644 src/core/unit-dependency-atom.c create mode 100644 src/core/unit-dependency-atom.h create mode 100644 src/core/unit-printf.c create mode 100644 src/core/unit-printf.h create mode 100644 src/core/unit-serialize.c create mode 100644 src/core/unit-serialize.h create mode 100644 src/core/unit.c create mode 100644 src/core/unit.h create mode 100644 src/core/user.conf.in (limited to 'src/core') diff --git a/src/core/all-units.h b/src/core/all-units.h new file mode 100644 index 0000000..fad814b --- /dev/null +++ b/src/core/all-units.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "unit.h" + +#include "automount.h" +#include "device.h" +#include "path.h" +#include "scope.h" +#include "service.h" +#include "slice.h" +#include "socket.h" +#include "swap.h" +#include "target.h" +#include "timer.h" diff --git a/src/core/apparmor-setup.c b/src/core/apparmor-setup.c new file mode 100644 index 0000000..3426a10 --- /dev/null +++ b/src/core/apparmor-setup.c @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#if HAVE_APPARMOR +# include +#endif +#include + +#include "apparmor-setup.h" +#include "apparmor-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" + +#if HAVE_APPARMOR +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(aa_policy_cache *, aa_policy_cache_unref, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(aa_features *, aa_features_unref, NULL); +#endif + +int mac_apparmor_setup(void) { +#if HAVE_APPARMOR + _cleanup_(aa_policy_cache_unrefp) aa_policy_cache *policy_cache = NULL; + _cleanup_(aa_features_unrefp) aa_features *features = NULL; + _cleanup_free_ char *current_profile = NULL, *cache_dir_path = NULL; + int r; + + if (!mac_apparmor_use()) { + log_debug("AppArmor either not supported by the kernel or disabled."); + return 0; + } + + /* To enable LSM stacking a patch to the kernel is proposed to create a + * per-LSM subdirectory to distinguish between the LSMs. Therefore, we + * read the file from the LSM specific directory first and only if that + * fails the one from the generic directory. + */ + FOREACH_STRING(current_file, "/proc/self/attr/apparmor/current", "/proc/self/attr/current") { + r = read_one_line_file(current_file, ¤t_profile); + if (r == -ENOENT) + continue; + else if (r < 0) + log_warning_errno(r, "Failed to read current AppArmor profile from file %s, ignoring: %m", current_file); + else + break; + } + if (!current_profile) { + log_warning("Failed to get the current AppArmor profile of systemd from /proc/self/attr/apparmor/current or /proc/self/attr/current, ignoring."); + return 0; + } + if (!streq(current_profile, "unconfined")) { + log_debug("We are already confined in an AppArmor profile."); + return 0; + } + + r = aa_features_new_from_kernel(&features); + if (r < 0) { + log_warning_errno(errno, "Failed to get the AppArmor feature set from the kernel, ignoring: %m"); + return 0; + } + cache_dir_path = aa_policy_cache_dir_path_preview(features, AT_FDCWD, "/etc/apparmor/earlypolicy"); + if (!cache_dir_path) { + log_debug_errno(errno, "Failed to get the path of the early AppArmor policy cache directory."); + return 0; + } + + /* aa_policy_cache_new will internally use the same path as aa_policy_cache_dir_path_preview has returned. */ + r = aa_policy_cache_new(&policy_cache, features, AT_FDCWD, "/etc/apparmor/earlypolicy", 0); + if (r < 0) { + if (errno == ENOENT) { + log_debug_errno(errno, "The early AppArmor policy cache directory %s does not exist.", cache_dir_path); + return 0; + } + log_warning_errno(errno, "Failed to create a new AppArmor policy cache, ignoring: %m"); + return 0; + } + r = aa_policy_cache_replace_all(policy_cache, NULL); + if (r < 0) { + log_warning_errno(errno, "Failed to load the profiles from the early AppArmor policy cache directory %s, ignoring: %m", cache_dir_path); + return 0; + } + + log_info("Successfully loaded all binary profiles from AppArmor early policy cache at %s.", cache_dir_path); + + r = aa_change_profile("systemd"); + if (r < 0) { + if (errno == ENOENT) + log_debug_errno(errno, "Failed to change to AppArmor profile 'systemd'. Please ensure that one of the binary profile files in policy cache directory %s contains a profile with that name.", cache_dir_path); + else + log_error_errno(errno, "Failed to change to AppArmor profile 'systemd': %m"); + return 0; + } + + log_info("Changed to AppArmor profile systemd."); +#endif + return 0; +} diff --git a/src/core/apparmor-setup.h b/src/core/apparmor-setup.h new file mode 100644 index 0000000..f3b7382 --- /dev/null +++ b/src/core/apparmor-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int mac_apparmor_setup(void); diff --git a/src/core/audit-fd.c b/src/core/audit-fd.c new file mode 100644 index 0000000..6674fa8 --- /dev/null +++ b/src/core/audit-fd.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "audit-fd.h" + +#if HAVE_AUDIT + +#include +#include + +#include "capability-util.h" +#include "fd-util.h" +#include "log.h" + +static bool initialized = false; +static int audit_fd; + +int get_audit_fd(void) { + + if (!initialized) { + if (have_effective_cap(CAP_AUDIT_WRITE) <= 0) { + audit_fd = -EPERM; + initialized = true; + + return audit_fd; + } + + audit_fd = audit_open(); + + if (audit_fd < 0) { + if (!IN_SET(errno, EAFNOSUPPORT, EPROTONOSUPPORT)) + log_error_errno(errno, "Failed to connect to audit log: %m"); + + audit_fd = errno ? -errno : -EINVAL; + } + + initialized = true; + } + + return audit_fd; +} + +void close_audit_fd(void) { + + if (initialized && audit_fd >= 0) + safe_close(audit_fd); + + initialized = true; + audit_fd = -ECONNRESET; +} + +#else + +int get_audit_fd(void) { + return -EAFNOSUPPORT; +} + +void close_audit_fd(void) { +} + +#endif diff --git a/src/core/audit-fd.h b/src/core/audit-fd.h new file mode 100644 index 0000000..5cdf61e --- /dev/null +++ b/src/core/audit-fd.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int get_audit_fd(void); +void close_audit_fd(void); diff --git a/src/core/automount.c b/src/core/automount.c new file mode 100644 index 0000000..14bf7e6 --- /dev/null +++ b/src/core/automount.c @@ -0,0 +1,1149 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "async.h" +#include "automount.h" +#include "bus-error.h" +#include "bus-util.h" +#include "dbus-automount.h" +#include "dbus-unit.h" +#include "fd-util.h" +#include "format-util.h" +#include "fstab-util.h" +#include "io-util.h" +#include "label-util.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mount.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = { + [AUTOMOUNT_DEAD] = UNIT_INACTIVE, + [AUTOMOUNT_WAITING] = UNIT_ACTIVE, + [AUTOMOUNT_RUNNING] = UNIT_ACTIVE, + [AUTOMOUNT_FAILED] = UNIT_FAILED +}; + +static int open_dev_autofs(Manager *m); +static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata); +static int automount_start_expire(Automount *a); +static void automount_stop_expire(Automount *a); +static int automount_send_ready(Automount *a, Set *tokens, int status); + +static void automount_init(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + assert(u); + assert(u->load_state == UNIT_STUB); + + a->pipe_fd = -EBADF; + a->directory_mode = 0755; + UNIT(a)->ignore_on_isolate = true; +} + +static void unmount_autofs(Automount *a) { + int r; + + assert(a); + + if (a->pipe_fd < 0) + return; + + a->pipe_event_source = sd_event_source_disable_unref(a->pipe_event_source); + a->pipe_fd = safe_close(a->pipe_fd); + + /* If we reload/reexecute things we keep the mount point around */ + if (!IN_SET(UNIT(a)->manager->objective, MANAGER_RELOAD, MANAGER_REEXECUTE)) { + + automount_send_ready(a, a->tokens, -EHOSTDOWN); + automount_send_ready(a, a->expire_tokens, -EHOSTDOWN); + + if (a->where) { + r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW); + if (r < 0) + log_unit_error_errno(UNIT(a), r, "Failed to unmount: %m"); + } + } +} + +static void automount_done(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + + unmount_autofs(a); + + a->where = mfree(a->where); + a->extra_options = mfree(a->extra_options); + + a->tokens = set_free(a->tokens); + a->expire_tokens = set_free(a->expire_tokens); + + a->expire_event_source = sd_event_source_disable_unref(a->expire_event_source); +} + +static int automount_add_trigger_dependencies(Automount *a) { + Unit *x; + int r; + + assert(a); + + r = unit_load_related_unit(UNIT(a), ".mount", &x); + if (r < 0) + return r; + + return unit_add_two_dependencies(UNIT(a), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int automount_add_mount_dependencies(Automount *a) { + _cleanup_free_ char *parent = NULL; + int r; + + assert(a); + + r = path_extract_directory(a->where, &parent); + if (r < 0) + return r; + + return unit_require_mounts_for(UNIT(a), parent, UNIT_DEPENDENCY_IMPLICIT); +} + +static int automount_add_default_dependencies(Automount *a) { + int r; + + assert(a); + + if (!UNIT(a)->default_dependencies) + return 0; + + if (!MANAGER_IS_SYSTEM(UNIT(a)->manager)) + return 0; + + r = unit_add_dependency_by_name(UNIT(a), UNIT_BEFORE, SPECIAL_LOCAL_FS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + r = unit_add_dependency_by_name(UNIT(a), UNIT_AFTER, SPECIAL_LOCAL_FS_PRE_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(UNIT(a), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return 0; +} + +static int automount_verify(Automount *a) { + static const char *const reserved_options[] = { + "fd\0", + "pgrp\0", + "minproto\0", + "maxproto\0", + "direct\0", + "indirect\0", + }; + + _cleanup_free_ char *e = NULL; + int r; + + assert(a); + assert(UNIT(a)->load_state == UNIT_LOADED); + + if (path_equal(a->where, "/")) + return log_unit_error_errno(UNIT(a), SYNTHETIC_ERRNO(ENOEXEC), "Cannot have an automount unit for the root directory. Refusing."); + + r = unit_name_from_path(a->where, ".automount", &e); + if (r < 0) + return log_unit_error_errno(UNIT(a), r, "Failed to generate unit name from path: %m"); + + if (!unit_has_name(UNIT(a), e)) + return log_unit_error_errno(UNIT(a), SYNTHETIC_ERRNO(ENOEXEC), "Where= setting doesn't match unit name. Refusing."); + + for (size_t i = 0; i < ELEMENTSOF(reserved_options); i++) + if (fstab_test_option(a->extra_options, reserved_options[i])) + return log_unit_error_errno( + UNIT(a), + SYNTHETIC_ERRNO(ENOEXEC), + "ExtraOptions= setting may not contain reserved option %s.", + reserved_options[i]); + + return 0; +} + +static int automount_set_where(Automount *a) { + int r; + + assert(a); + + if (a->where) + return 0; + + r = unit_name_to_path(UNIT(a)->id, &a->where); + if (r < 0) + return r; + + path_simplify(a->where); + return 1; +} + +static int automount_add_extras(Automount *a) { + int r; + + r = automount_set_where(a); + if (r < 0) + return r; + + r = automount_add_trigger_dependencies(a); + if (r < 0) + return r; + + r = automount_add_mount_dependencies(a); + if (r < 0) + return r; + + return automount_add_default_dependencies(a); +} + +static int automount_load(Unit *u) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + /* Load a .automount file */ + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + r = automount_add_extras(a); + if (r < 0) + return r; + + return automount_verify(a); +} + +static void automount_set_state(Automount *a, AutomountState state) { + AutomountState old_state; + assert(a); + + if (a->state != state) + bus_unit_send_pending_change_signal(UNIT(a), false); + + old_state = a->state; + a->state = state; + + if (state != AUTOMOUNT_RUNNING) + automount_stop_expire(a); + + if (!IN_SET(state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) + unmount_autofs(a); + + if (state != old_state) + log_unit_debug(UNIT(a), "Changed %s -> %s", automount_state_to_string(old_state), automount_state_to_string(state)); + + unit_notify(UNIT(a), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); +} + +static int automount_coldplug(Unit *u) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(a->state == AUTOMOUNT_DEAD); + + if (a->deserialized_state == a->state) + return 0; + + if (IN_SET(a->deserialized_state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) { + + r = automount_set_where(a); + if (r < 0) + return r; + + r = open_dev_autofs(u->manager); + if (r < 0) + return r; + + assert(a->pipe_fd >= 0); + + r = sd_event_add_io(u->manager->event, &a->pipe_event_source, a->pipe_fd, EPOLLIN, automount_dispatch_io, u); + if (r < 0) + return r; + + (void) sd_event_source_set_description(a->pipe_event_source, "automount-io"); + if (a->deserialized_state == AUTOMOUNT_RUNNING) { + r = automount_start_expire(a); + if (r < 0) + log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m"); + } + + automount_set_state(a, a->deserialized_state); + } + + return 0; +} + +static void automount_dump(Unit *u, FILE *f, const char *prefix) { + Automount *a = AUTOMOUNT(u); + + assert(a); + + fprintf(f, + "%sAutomount State: %s\n" + "%sResult: %s\n" + "%sWhere: %s\n" + "%sExtraOptions: %s\n" + "%sDirectoryMode: %04o\n" + "%sTimeoutIdleUSec: %s\n", + prefix, automount_state_to_string(a->state), + prefix, automount_result_to_string(a->result), + prefix, a->where, + prefix, a->extra_options, + prefix, a->directory_mode, + prefix, FORMAT_TIMESPAN(a->timeout_idle_usec, USEC_PER_SEC)); +} + +static void automount_enter_dead(Automount *a, AutomountResult f) { + assert(a); + + if (a->result == AUTOMOUNT_SUCCESS) + a->result = f; + + unit_log_result(UNIT(a), a->result == AUTOMOUNT_SUCCESS, automount_result_to_string(a->result)); + automount_set_state(a, a->result != AUTOMOUNT_SUCCESS ? AUTOMOUNT_FAILED : AUTOMOUNT_DEAD); +} + +static int open_dev_autofs(Manager *m) { + struct autofs_dev_ioctl param; + int r; + + assert(m); + + if (m->dev_autofs_fd >= 0) + return m->dev_autofs_fd; + + (void) label_fix("/dev/autofs", 0); + + m->dev_autofs_fd = open("/dev/autofs", O_CLOEXEC|O_RDONLY); + if (m->dev_autofs_fd < 0) + return log_error_errno(errno, "Failed to open /dev/autofs: %m"); + + init_autofs_dev_ioctl(¶m); + r = RET_NERRNO(ioctl(m->dev_autofs_fd, AUTOFS_DEV_IOCTL_VERSION, ¶m)); + if (r < 0) { + m->dev_autofs_fd = safe_close(m->dev_autofs_fd); + return log_error_errno(r, "Failed to issue AUTOFS_DEV_IOCTL_VERSION ioctl: %m"); + } + + log_debug("Autofs kernel version %u.%u", param.ver_major, param.ver_minor); + + return m->dev_autofs_fd; +} + +static int open_ioctl_fd(int dev_autofs_fd, const char *where, dev_t devid) { + struct autofs_dev_ioctl *param; + size_t l; + + assert(dev_autofs_fd >= 0); + assert(where); + + l = sizeof(struct autofs_dev_ioctl) + strlen(where) + 1; + param = alloca_safe(l); + + init_autofs_dev_ioctl(param); + param->size = l; + param->ioctlfd = -EBADF; + param->openmount.devid = devid; + strcpy(param->path, where); + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_OPENMOUNT, param) < 0) + return -errno; + + if (param->ioctlfd < 0) + return -EIO; + + (void) fd_cloexec(param->ioctlfd, true); + return param->ioctlfd; +} + +static int autofs_protocol(int dev_autofs_fd, int ioctl_fd) { + uint32_t major, minor; + struct autofs_dev_ioctl param; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOVER, ¶m) < 0) + return -errno; + + major = param.protover.version; + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOSUBVER, ¶m) < 0) + return -errno; + + minor = param.protosubver.sub_version; + + log_debug("Autofs protocol version %u.%u", major, minor); + return 0; +} + +static int autofs_set_timeout(int dev_autofs_fd, int ioctl_fd, usec_t usec) { + struct autofs_dev_ioctl param; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (usec == USEC_INFINITY) + param.timeout.timeout = 0; + else + /* Convert to seconds, rounding up. */ + param.timeout.timeout = DIV_ROUND_UP(usec, USEC_PER_SEC); + + return RET_NERRNO(ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_TIMEOUT, ¶m)); +} + +static int autofs_send_ready(int dev_autofs_fd, int ioctl_fd, uint32_t token, int status) { + struct autofs_dev_ioctl param; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (status != 0) { + param.fail.token = token; + param.fail.status = status; + } else + param.ready.token = token; + + return RET_NERRNO(ioctl(dev_autofs_fd, status ? AUTOFS_DEV_IOCTL_FAIL : AUTOFS_DEV_IOCTL_READY, ¶m)); +} + +static int automount_send_ready(Automount *a, Set *tokens, int status) { + _cleanup_close_ int ioctl_fd = -EBADF; + unsigned token; + int r; + + assert(a); + assert(status <= 0); + + if (set_isempty(tokens)) + return 0; + + ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); + if (ioctl_fd < 0) + return ioctl_fd; + + if (status != 0) + log_unit_debug_errno(UNIT(a), status, "Sending failure: %m"); + else + log_unit_debug(UNIT(a), "Sending success."); + + r = 0; + + /* Autofs thankfully does not hand out 0 as a token */ + while ((token = PTR_TO_UINT(set_steal_first(tokens)))) { + int k; + + /* Autofs fun fact: + * + * if you pass a positive status code here, kernels + * prior to 4.12 will freeze! Yay! */ + + k = autofs_send_ready(UNIT(a)->manager->dev_autofs_fd, + ioctl_fd, + token, + status); + if (k < 0) + r = k; + } + + return r; +} + +static void automount_trigger_notify(Unit *u, Unit *other) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(other); + + /* Filter out invocations with bogus state */ + assert(UNIT_IS_LOAD_COMPLETE(other->load_state)); + assert(other->type == UNIT_MOUNT); + + /* Don't propagate state changes from the mount if we are already down */ + if (!IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) + return; + + /* Propagate start limit hit state */ + if (other->start_limit_hit) { + automount_enter_dead(a, AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT); + return; + } + + /* Don't propagate anything if there's still a job queued */ + if (other->job) + return; + + /* The mount is successfully established */ + if (IN_SET(MOUNT(other)->state, MOUNT_MOUNTED, MOUNT_REMOUNTING)) { + (void) automount_send_ready(a, a->tokens, 0); + + r = automount_start_expire(a); + if (r < 0) + log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m"); + + automount_set_state(a, AUTOMOUNT_RUNNING); + } + + if (IN_SET(MOUNT(other)->state, + MOUNT_MOUNTING, MOUNT_MOUNTING_DONE, + MOUNT_MOUNTED, MOUNT_REMOUNTING, + MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL, + MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL, + MOUNT_FAILED)) + (void) automount_send_ready(a, a->expire_tokens, -ENODEV); + + if (MOUNT(other)->state == MOUNT_DEAD) + (void) automount_send_ready(a, a->expire_tokens, 0); + + /* The mount is in some unhappy state now, let's unfreeze any waiting clients */ + if (IN_SET(MOUNT(other)->state, + MOUNT_DEAD, MOUNT_UNMOUNTING, + MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL, + MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL, + MOUNT_FAILED)) { + + (void) automount_send_ready(a, a->tokens, -ENODEV); + + automount_set_state(a, AUTOMOUNT_WAITING); + } +} + +static void automount_enter_waiting(Automount *a) { + _cleanup_close_pair_ int pipe_fd[2] = EBADF_PAIR; + _cleanup_close_ int ioctl_fd = -EBADF; + char name[STRLEN("systemd-") + DECIMAL_STR_MAX(pid_t) + 1]; + _cleanup_free_ char *options = NULL; + bool mounted = false; + int r, dev_autofs_fd; + struct stat st; + + assert(a); + assert(a->pipe_fd < 0); + assert(a->where); + + set_clear(a->tokens); + + r = unit_fail_if_noncanonical(UNIT(a), a->where); + if (r < 0) + goto fail; + + (void) mkdir_p_label(a->where, a->directory_mode); + + unit_warn_if_dir_nonempty(UNIT(a), a->where); + + dev_autofs_fd = open_dev_autofs(UNIT(a)->manager); + if (dev_autofs_fd < 0) + goto fail; + + if (pipe2(pipe_fd, O_CLOEXEC) < 0) { + log_unit_warning_errno(UNIT(a), errno, "Failed to allocate autofs pipe: %m"); + goto fail; + } + r = fd_nonblock(pipe_fd[0], true); + if (r < 0) { + log_unit_warning_errno(UNIT(a), r, "Failed to make read side of pipe non-blocking: %m"); + goto fail; + } + + if (asprintf( + &options, + "fd=%i,pgrp="PID_FMT",minproto=5,maxproto=5,direct%s%s", + pipe_fd[1], + getpgrp(), + isempty(a->extra_options) ? "" : ",", + strempty(a->extra_options)) < 0) { + log_oom(); + goto fail; + } + + xsprintf(name, "systemd-"PID_FMT, getpid_cached()); + r = mount_nofollow_verbose(LOG_WARNING, name, a->where, "autofs", 0, options); + if (r < 0) + goto fail; + + mounted = true; + + pipe_fd[1] = safe_close(pipe_fd[1]); + + if (stat(a->where, &st) < 0) { + log_unit_warning_errno(UNIT(a), errno, "Failed to stat new automount point '%s': %m", a->where); + goto fail; + } + + ioctl_fd = open_ioctl_fd(dev_autofs_fd, a->where, st.st_dev); + if (ioctl_fd < 0) { + log_unit_warning_errno(UNIT(a), ioctl_fd, "Failed to open automount ioctl fd for '%s': %m", a->where); + goto fail; + } + + r = autofs_protocol(dev_autofs_fd, ioctl_fd); + if (r < 0) { + log_unit_warning_errno(UNIT(a), r, "Failed to validate autofs protocol for '%s': %m", a->where); + goto fail; + } + + r = autofs_set_timeout(dev_autofs_fd, ioctl_fd, a->timeout_idle_usec); + if (r < 0) { + log_unit_warning_errno(UNIT(a), r, "Failed to set autofs timeout for '%s': %m", a->where); + goto fail; + } + + r = sd_event_add_io(UNIT(a)->manager->event, &a->pipe_event_source, pipe_fd[0], EPOLLIN, automount_dispatch_io, a); + if (r < 0) { + log_unit_warning_errno(UNIT(a), r, "Failed to allocate IO event source for autofs mount '%s': %m", a->where); + goto fail; + } + + (void) sd_event_source_set_description(a->pipe_event_source, "automount-io"); + + a->pipe_fd = TAKE_FD(pipe_fd[0]); + a->dev_id = st.st_dev; + + automount_set_state(a, AUTOMOUNT_WAITING); + return; + +fail: + if (mounted) { + r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW); + if (r < 0) + log_unit_warning_errno(UNIT(a), r, "Failed to unmount, ignoring: %m"); + } + + automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); +} + +static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) { + int r; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + /* Issue AUTOFS_DEV_IOCTL_EXPIRE in subprocess, asynchronously. Note that we don't keep track of the + * child's PID, we are PID1/autoreaper after all, hence when it dies we'll automatically clean it up + * anyway. */ + + r = safe_fork_full("(sd-expire)", + /* stdio_fds= */ NULL, + (int[]) { dev_autofs_fd, ioctl_fd }, + /* n_except_fds= */ 2, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG, + /* pid= */ NULL); + if (r != 0) + return r; + + /* Child */ + for (;;) { + struct autofs_dev_ioctl param; + init_autofs_dev_ioctl(¶m); + param.ioctlfd = ioctl_fd; + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, ¶m) < 0) + break; + } + + if (errno != EAGAIN) + log_warning_errno(errno, "Failed to expire automount, ignoring: %m"); + + _exit(EXIT_SUCCESS); +} + +static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) { + _cleanup_close_ int ioctl_fd = -EBADF; + Automount *a = AUTOMOUNT(userdata); + int r; + + assert(a); + assert(source == a->expire_event_source); + + ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); + if (ioctl_fd < 0) + return log_unit_error_errno(UNIT(a), ioctl_fd, "Couldn't open autofs ioctl fd: %m"); + + r = asynchronous_expire(UNIT(a)->manager->dev_autofs_fd, ioctl_fd); + if (r < 0) + return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m"); + + return automount_start_expire(a); +} + +static int automount_start_expire(Automount *a) { + usec_t timeout; + int r; + + assert(a); + + if (a->timeout_idle_usec == 0) + return 0; + + timeout = MAX(a->timeout_idle_usec/3, USEC_PER_SEC); + + if (a->expire_event_source) { + r = sd_event_source_set_time_relative(a->expire_event_source, timeout); + if (r < 0) + return r; + + return sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_ONESHOT); + } + + r = sd_event_add_time_relative( + UNIT(a)->manager->event, + &a->expire_event_source, + CLOCK_MONOTONIC, timeout, 0, + automount_dispatch_expire, a); + if (r < 0) + return r; + + (void) sd_event_source_set_description(a->expire_event_source, "automount-expire"); + + return 0; +} + +static void automount_stop_expire(Automount *a) { + assert(a); + + if (!a->expire_event_source) + return; + + (void) sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_OFF); +} + +static void automount_enter_running(Automount *a) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *trigger; + struct stat st; + int r; + + assert(a); + + /* If the user masked our unit in the meantime, fail */ + if (UNIT(a)->load_state != UNIT_LOADED) { + log_unit_error(UNIT(a), "Suppressing automount event since unit is no longer loaded."); + goto fail; + } + + /* We don't take mount requests anymore if we are supposed to + * shut down anyway */ + if (unit_stop_pending(UNIT(a))) { + log_unit_debug(UNIT(a), "Suppressing automount request since unit stop is scheduled."); + automount_send_ready(a, a->tokens, -EHOSTDOWN); + automount_send_ready(a, a->expire_tokens, -EHOSTDOWN); + return; + } + + (void) mkdir_p_label(a->where, a->directory_mode); + + /* Before we do anything, let's see if somebody is playing games with us? */ + if (lstat(a->where, &st) < 0) { + log_unit_warning_errno(UNIT(a), errno, "Failed to stat automount point: %m"); + goto fail; + } + + /* The mount unit may have been explicitly started before we got the + * autofs request. Ack it to unblock anything waiting on the mount point. */ + if (!S_ISDIR(st.st_mode) || st.st_dev != a->dev_id) { + log_unit_info(UNIT(a), "Automount point already active?"); + automount_send_ready(a, a->tokens, 0); + return; + } + + trigger = UNIT_TRIGGER(UNIT(a)); + if (!trigger) { + log_unit_error(UNIT(a), "Unit to trigger vanished."); + goto fail; + } + + r = manager_add_job(UNIT(a)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) { + log_unit_warning(UNIT(a), "Failed to queue mount startup job: %s", bus_error_message(&error, r)); + goto fail; + } + + automount_set_state(a, AUTOMOUNT_RUNNING); + return; + +fail: + automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); +} + +static int automount_start(Unit *u) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(IN_SET(a->state, AUTOMOUNT_DEAD, AUTOMOUNT_FAILED)); + + if (path_is_mount_point(a->where, NULL, 0) > 0) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EEXIST), "Path %s is already a mount point, refusing start.", a->where); + + r = unit_test_trigger_loaded(u); + if (r < 0) + return r; + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + a->result = AUTOMOUNT_SUCCESS; + automount_enter_waiting(a); + return 1; +} + +static int automount_stop(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + assert(IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)); + + automount_enter_dead(a, AUTOMOUNT_SUCCESS); + return 1; +} + +static int automount_serialize(Unit *u, FILE *f, FDSet *fds) { + Automount *a = AUTOMOUNT(u); + void *p; + int r; + + assert(a); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", automount_state_to_string(a->state)); + (void) serialize_item(f, "result", automount_result_to_string(a->result)); + (void) serialize_item_format(f, "dev-id", "%lu", (unsigned long) a->dev_id); + + SET_FOREACH(p, a->tokens) + (void) serialize_item_format(f, "token", "%u", PTR_TO_UINT(p)); + SET_FOREACH(p, a->expire_tokens) + (void) serialize_item_format(f, "expire-token", "%u", PTR_TO_UINT(p)); + + r = serialize_fd(f, fds, "pipe-fd", a->pipe_fd); + if (r < 0) + return r; + + return 0; +} + +static int automount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(fds); + + if (streq(key, "state")) { + AutomountState state; + + state = automount_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + a->deserialized_state = state; + } else if (streq(key, "result")) { + AutomountResult f; + + f = automount_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != AUTOMOUNT_SUCCESS) + a->result = f; + + } else if (streq(key, "dev-id")) { + unsigned long d; + + if (safe_atolu(value, &d) < 0) + log_unit_debug(u, "Failed to parse dev-id value: %s", value); + else + a->dev_id = (dev_t) d; + + } else if (streq(key, "token")) { + unsigned token; + + if (safe_atou(value, &token) < 0) + log_unit_debug(u, "Failed to parse token value: %s", value); + else { + r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(token)); + if (r < 0) + log_unit_error_errno(u, r, "Failed to add token to set: %m"); + } + } else if (streq(key, "expire-token")) { + unsigned token; + + if (safe_atou(value, &token) < 0) + log_unit_debug(u, "Failed to parse token value: %s", value); + else { + r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(token)); + if (r < 0) + log_unit_error_errno(u, r, "Failed to add expire token to set: %m"); + } + } else if (streq(key, "pipe-fd")) { + safe_close(a->pipe_fd); + a->pipe_fd = deserialize_fd(fds, value); + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static UnitActiveState automount_active_state(Unit *u) { + assert(u); + + return state_translation_table[AUTOMOUNT(u)->state]; +} + +static const char *automount_sub_state_to_string(Unit *u) { + assert(u); + + return automount_state_to_string(AUTOMOUNT(u)->state); +} + +static bool automount_may_gc(Unit *u) { + Unit *t; + + assert(u); + + t = UNIT_TRIGGER(u); + if (!t) + return true; + + return UNIT_VTABLE(t)->may_gc(t); +} + +static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + union autofs_v5_packet_union packet; + Automount *a = AUTOMOUNT(userdata); + Unit *trigger; + int r; + + assert(a); + assert(fd == a->pipe_fd); + + if (events & (EPOLLHUP|EPOLLERR)) { + log_unit_error(UNIT(a), "Got hangup/error on autofs pipe from kernel. Likely our automount point has been unmounted by someone or something else?"); + automount_enter_dead(a, AUTOMOUNT_FAILURE_UNMOUNTED); + return 0; + } + + if (events != EPOLLIN) { + log_unit_error(UNIT(a), "Got invalid poll event %"PRIu32" on pipe (fd=%d)", events, fd); + goto fail; + } + + r = loop_read_exact(a->pipe_fd, &packet, sizeof(packet), true); + if (r < 0) { + log_unit_error_errno(UNIT(a), r, "Invalid read from pipe: %m"); + goto fail; + } + + switch (packet.hdr.type) { + + case autofs_ptype_missing_direct: + + if (packet.v5_packet.pid > 0) { + _cleanup_free_ char *p = NULL; + + (void) pid_get_comm(packet.v5_packet.pid, &p); + log_unit_info(UNIT(a), "Got automount request for %s, triggered by %"PRIu32" (%s)", a->where, packet.v5_packet.pid, strna(p)); + } else + log_unit_debug(UNIT(a), "Got direct mount request on %s", a->where); + + r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token)); + if (r < 0) { + log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m"); + goto fail; + } + + automount_enter_running(a); + break; + + case autofs_ptype_expire_direct: + log_unit_debug(UNIT(a), "Got direct umount request on %s", a->where); + + automount_stop_expire(a); + + r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token)); + if (r < 0) { + log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m"); + goto fail; + } + + trigger = UNIT_TRIGGER(UNIT(a)); + if (!trigger) { + log_unit_error(UNIT(a), "Unit to trigger vanished."); + goto fail; + } + + r = manager_add_job(UNIT(a)->manager, JOB_STOP, trigger, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) { + log_unit_warning(UNIT(a), "Failed to queue unmount job: %s", bus_error_message(&error, r)); + goto fail; + } + break; + + default: + log_unit_error(UNIT(a), "Received unknown automount request %i", packet.hdr.type); + break; + } + + return 0; + +fail: + automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); + return 0; +} + +static void automount_shutdown(Manager *m) { + assert(m); + + m->dev_autofs_fd = safe_close(m->dev_autofs_fd); +} + +static void automount_reset_failed(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + + if (a->state == AUTOMOUNT_FAILED) + automount_set_state(a, AUTOMOUNT_DEAD); + + a->result = AUTOMOUNT_SUCCESS; +} + +static bool automount_supported(void) { + static int supported = -1; + + if (supported < 0) + supported = access("/dev/autofs", F_OK) >= 0; + + return supported; +} + +static int automount_can_start(Unit *u) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + + r = unit_test_start_limit(u); + if (r < 0) { + automount_enter_dead(a, AUTOMOUNT_FAILURE_START_LIMIT_HIT); + return r; + } + + return 1; +} + +static const char* const automount_result_table[_AUTOMOUNT_RESULT_MAX] = { + [AUTOMOUNT_SUCCESS] = "success", + [AUTOMOUNT_FAILURE_RESOURCES] = "resources", + [AUTOMOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT] = "mount-start-limit-hit", + [AUTOMOUNT_FAILURE_UNMOUNTED] = "unmounted", +}; + +DEFINE_STRING_TABLE_LOOKUP(automount_result, AutomountResult); + +const UnitVTable automount_vtable = { + .object_size = sizeof(Automount), + + .sections = + "Unit\0" + "Automount\0" + "Install\0", + .private_section = "Automount", + + .can_transient = true, + .can_fail = true, + .can_trigger = true, + .exclude_from_switch_root_serialization = true, + + .init = automount_init, + .load = automount_load, + .done = automount_done, + + .coldplug = automount_coldplug, + + .dump = automount_dump, + + .start = automount_start, + .stop = automount_stop, + + .serialize = automount_serialize, + .deserialize_item = automount_deserialize_item, + + .active_state = automount_active_state, + .sub_state_to_string = automount_sub_state_to_string, + + .may_gc = automount_may_gc, + + .trigger_notify = automount_trigger_notify, + + .reset_failed = automount_reset_failed, + + .bus_set_property = bus_automount_set_property, + + .shutdown = automount_shutdown, + .supported = automount_supported, + + .status_message_formats = { + .finished_start_job = { + [JOB_DONE] = "Set up automount %s.", + [JOB_FAILED] = "Failed to set up automount %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Unset automount %s.", + [JOB_FAILED] = "Failed to unset automount %s.", + }, + }, + + .can_start = automount_can_start, +}; diff --git a/src/core/automount.h b/src/core/automount.h new file mode 100644 index 0000000..e413f23 --- /dev/null +++ b/src/core/automount.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Automount Automount; + +#include "unit.h" + +typedef enum AutomountResult { + AUTOMOUNT_SUCCESS, + AUTOMOUNT_FAILURE_RESOURCES, + AUTOMOUNT_FAILURE_UNMOUNTED, + AUTOMOUNT_FAILURE_START_LIMIT_HIT, + AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT, + _AUTOMOUNT_RESULT_MAX, + _AUTOMOUNT_RESULT_INVALID = -EINVAL, +} AutomountResult; + +struct Automount { + Unit meta; + + AutomountState state, deserialized_state; + + char *where; + char *extra_options; + usec_t timeout_idle_usec; + + int pipe_fd; + sd_event_source *pipe_event_source; + mode_t directory_mode; + dev_t dev_id; + + Set *tokens; + Set *expire_tokens; + + sd_event_source *expire_event_source; + + AutomountResult result; +}; + +extern const UnitVTable automount_vtable; + +const char* automount_result_to_string(AutomountResult i) _const_; +AutomountResult automount_result_from_string(const char *s) _pure_; + +DEFINE_CAST(AUTOMOUNT, Automount); diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c new file mode 100644 index 0000000..06d2146 --- /dev/null +++ b/src/core/bpf-devices.c @@ -0,0 +1,505 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "bpf-devices.h" +#include "bpf-program.h" +#include "devnum-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "stdio-util.h" +#include "string-util.h" + +#define PASS_JUMP_OFF 4096 + +/* Ensure the high level flags we use and the low-level BPF flags exposed on the kernel are defined the same way */ +assert_cc((unsigned) BPF_DEVCG_ACC_MKNOD == (unsigned) CGROUP_DEVICE_MKNOD); +assert_cc((unsigned) BPF_DEVCG_ACC_READ == (unsigned) CGROUP_DEVICE_READ); +assert_cc((unsigned) BPF_DEVCG_ACC_WRITE == (unsigned) CGROUP_DEVICE_WRITE); + +static int bpf_prog_allow_list_device( + BPFProgram *prog, + char type, + int major, + int minor, + CGroupDevicePermissions p) { + + int r; + + assert(prog); + + log_trace("%s: %c %d:%d %s", __func__, type, major, minor, cgroup_device_permissions_to_string(p)); + + if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX) + return -EINVAL; + + assert(IN_SET(type, 'b', 'c')); + const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK; + + const struct bpf_insn insn[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, p), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 4), /* compare access type */ + + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 3), /* compare device type */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 2), /* compare major */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, 1), /* compare minor */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + + if (p == _CGROUP_DEVICE_PERMISSIONS_ALL) + r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3); + else + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +static int bpf_prog_allow_list_major( + BPFProgram *prog, + char type, + int major, + CGroupDevicePermissions p) { + + int r; + + assert(prog); + + log_trace("%s: %c %d:* %s", __func__, type, major, cgroup_device_permissions_to_string(p)); + + if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX) + return -EINVAL; + + assert(IN_SET(type, 'b', 'c')); + const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK; + + const struct bpf_insn insn[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, p), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */ + + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 2), /* compare device type */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1), /* compare major */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + + if (p == _CGROUP_DEVICE_PERMISSIONS_ALL) + r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3); + else + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +static int bpf_prog_allow_list_class( + BPFProgram *prog, + char type, + CGroupDevicePermissions p) { + + int r; + + assert(prog); + + log_trace("%s: %c *:* %s", __func__, type, cgroup_device_permissions_to_string(p)); + + if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX) + return -EINVAL; + + assert(IN_SET(type, 'b', 'c')); + const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK; + + const struct bpf_insn insn[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, p), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */ + + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 1), /* compare device type */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + + if (p == _CGROUP_DEVICE_PERMISSIONS_ALL) + r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3); + else + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +int bpf_devices_cgroup_init( + BPFProgram **ret, + CGroupDevicePolicy policy, + bool allow_list) { + + const struct bpf_insn pre_insn[] = { + /* load device type to r2 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF), + + /* load access type to r3 */ + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16), + + /* load major number to r4 */ + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, major)), + + /* load minor number to r5 */ + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, minor)), + }; + + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + int r; + + assert(ret); + + if (policy == CGROUP_DEVICE_POLICY_AUTO && !allow_list) + return 0; + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &prog); + if (r < 0) + return log_error_errno(r, "Loading device control BPF program failed: %m"); + + if (policy == CGROUP_DEVICE_POLICY_CLOSED || allow_list) { + r = bpf_program_add_instructions(prog, pre_insn, ELEMENTSOF(pre_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + } + + *ret = TAKE_PTR(prog); + + return 0; +} + +int bpf_devices_apply_policy( + BPFProgram **prog, + CGroupDevicePolicy policy, + bool allow_list, + const char *cgroup_path, + BPFProgram **prog_installed) { + + _cleanup_free_ char *controller_path = NULL; + int r; + + /* This will assign *prog_installed if everything goes well. */ + + assert(prog); + if (!*prog) + goto finish; + + const bool deny_everything = policy == CGROUP_DEVICE_POLICY_STRICT && !allow_list; + + const struct bpf_insn post_insn[] = { + /* return DENY */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + }; + + const struct bpf_insn exit_insn[] = { + /* finally return DENY if deny_everything else ALLOW */ + BPF_MOV64_IMM(BPF_REG_0, deny_everything ? 0 : 1), + BPF_EXIT_INSN() + }; + + if (!deny_everything) { + r = bpf_program_add_instructions(*prog, post_insn, ELEMENTSOF(post_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + + /* Fixup PASS_JUMP_OFF jump offsets. */ + for (size_t off = 0; off < (*prog)->n_instructions; off++) { + struct bpf_insn *ins = &((*prog)->instructions[off]); + + if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF) + ins->off = (*prog)->n_instructions - off - 1; + } + } + + r = bpf_program_add_instructions(*prog, exit_insn, ELEMENTSOF(exit_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, NULL, &controller_path); + if (r < 0) + return log_error_errno(r, "Failed to determine cgroup path: %m"); + + r = bpf_program_cgroup_attach(*prog, BPF_CGROUP_DEVICE, controller_path, BPF_F_ALLOW_MULTI); + if (r < 0) + return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m", + empty_to_root(cgroup_path)); + + finish: + /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program. */ + if (prog_installed) { + bpf_program_free(*prog_installed); + *prog_installed = TAKE_PTR(*prog); + } + return 0; +} + +int bpf_devices_supported(void) { + const struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_(bpf_program_freep) BPFProgram *program = NULL; + static int supported = -1; + int r; + + /* Checks whether BPF device controller is supported. For this, we check five things: + * + * a) whether we are privileged + * b) whether the unified hierarchy is being used + * c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require + */ + + if (supported >= 0) + return supported; + + if (geteuid() != 0) { + log_debug("Not enough privileges, BPF device control is not supported."); + return supported = 0; + } + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m"); + if (r == 0) { + log_debug("Not running with unified cgroups, BPF device control is not supported."); + return supported = 0; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &program); + if (r < 0) { + log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial)); + if (r < 0) { + log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + r = bpf_program_load_kernel(program, NULL, 0); + if (r < 0) { + log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + return supported = 1; +} + +static int allow_list_device_pattern( + BPFProgram *prog, + const char *path, + char type, + const unsigned *maj, + const unsigned *min, + CGroupDevicePermissions p) { + + assert(IN_SET(type, 'b', 'c')); + + if (cg_all_unified() > 0) { + if (!prog) + return 0; + + if (maj && min) + return bpf_prog_allow_list_device(prog, type, *maj, *min, p); + else if (maj) + return bpf_prog_allow_list_major(prog, type, *maj, p); + else + return bpf_prog_allow_list_class(prog, type, p); + + } else { + char buf[2+DECIMAL_STR_MAX(unsigned)*2+2+4]; + int r; + + if (maj && min) + xsprintf(buf, "%c %u:%u %s", type, *maj, *min, cgroup_device_permissions_to_string(p)); + else if (maj) + xsprintf(buf, "%c %u:* %s", type, *maj, cgroup_device_permissions_to_string(p)); + else + xsprintf(buf, "%c *:* %s", type, cgroup_device_permissions_to_string(p)); + + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore + * EINVAL here. */ + + r = cg_set_attribute("devices", path, "devices.allow", buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, + r, "Failed to set devices.allow on %s: %m", path); + + return r; + } +} + +int bpf_devices_allow_list_device( + BPFProgram *prog, + const char *path, + const char *node, + CGroupDevicePermissions p) { + + mode_t mode; + dev_t rdev; + int r; + + assert(path); + assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX); + + log_trace("%s: %s %s", __func__, node, cgroup_device_permissions_to_string(p)); + + /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and + * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. This + * means clients can use these path without the device node actually around */ + r = device_path_parse_major_minor(node, &mode, &rdev); + if (r < 0) { + if (r != -ENODEV) + return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node); + + struct stat st; + if (stat(node, &st) < 0) + return log_warning_errno(errno, "Couldn't stat device %s: %m", node); + + if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) + return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), "%s is not a device.", node); + + mode = st.st_mode; + rdev = (dev_t) st.st_rdev; + } + + unsigned maj = major(rdev), min = minor(rdev); + return allow_list_device_pattern(prog, path, S_ISCHR(mode) ? 'c' : 'b', &maj, &min, p); +} + +int bpf_devices_allow_list_major( + BPFProgram *prog, + const char *path, + const char *name, + char type, + CGroupDevicePermissions permissions) { + + unsigned maj; + int r; + + assert(path); + assert(IN_SET(type, 'b', 'c')); + assert(permissions >= 0 && permissions < _CGROUP_DEVICE_PERMISSIONS_MAX); + + if (streq(name, "*")) + /* If the name is a wildcard, then apply this list to all devices of this type */ + return allow_list_device_pattern(prog, path, type, NULL, NULL, permissions); + + if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj)) + /* The name is numeric and suitable as major. In that case, let's take its major, and create + * the entry directly. */ + return allow_list_device_pattern(prog, path, type, &maj, NULL, permissions); + + _cleanup_fclose_ FILE *f = NULL; + bool good = false, any = false; + + f = fopen("/proc/devices", "re"); + if (!f) + return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s: %m", name); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *w, *p; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_warning_errno(r, "Failed to read /proc/devices: %m"); + if (r == 0) + break; + + if (type == 'c' && streq(line, "Character devices:")) { + good = true; + continue; + } + + if (type == 'b' && streq(line, "Block devices:")) { + good = true; + continue; + } + + if (isempty(line)) { + good = false; + continue; + } + + if (!good) + continue; + + p = strstrip(line); + + w = strpbrk(p, WHITESPACE); + if (!w) + continue; + *w = 0; + + r = safe_atou(p, &maj); + if (r < 0) + continue; + if (maj <= 0) + continue; + + w++; + w += strspn(w, WHITESPACE); + + if (fnmatch(name, w, 0) != 0) + continue; + + any = true; + (void) allow_list_device_pattern(prog, path, type, &maj, NULL, permissions); + } + + if (!any) + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), + "Device allow list pattern \"%s\" did not match anything.", name); + + return 0; +} + +int bpf_devices_allow_list_static( + BPFProgram *prog, + const char *path) { + + static const char auto_devices[] = + "/dev/null\0" "rwm\0" + "/dev/zero\0" "rwm\0" + "/dev/full\0" "rwm\0" + "/dev/random\0" "rwm\0" + "/dev/urandom\0" "rwm\0" + "/dev/tty\0" "rwm\0" + "/dev/ptmx\0" "rwm\0" + /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */ + "/run/systemd/inaccessible/chr\0" "rwm\0" + "/run/systemd/inaccessible/blk\0" "rwm\0"; + int r = 0, k; + + NULSTR_FOREACH_PAIR(node, acc, auto_devices) { + k = bpf_devices_allow_list_device(prog, path, node, cgroup_device_permissions_from_string(acc)); + if (r >= 0 && k < 0) + r = k; + } + + /* PTS (/dev/pts) devices may not be duplicated, but accessed */ + k = bpf_devices_allow_list_major(prog, path, "pts", 'c', CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE); + if (r >= 0 && k < 0) + r = k; + + return r; +} diff --git a/src/core/bpf-devices.h b/src/core/bpf-devices.h new file mode 100644 index 0000000..5660e1a --- /dev/null +++ b/src/core/bpf-devices.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "cgroup.h" + +typedef struct BPFProgram BPFProgram; + +int bpf_devices_cgroup_init(BPFProgram **ret, CGroupDevicePolicy policy, bool allow_list); +int bpf_devices_apply_policy( + BPFProgram **prog, + CGroupDevicePolicy policy, + bool allow_list, + const char *cgroup_path, + BPFProgram **prog_installed); + +int bpf_devices_supported(void); +int bpf_devices_allow_list_device(BPFProgram *prog, const char *path, const char *node, CGroupDevicePermissions p); +int bpf_devices_allow_list_major(BPFProgram *prog, const char *path, const char *name, char type, CGroupDevicePermissions p); +int bpf_devices_allow_list_static(BPFProgram *prog, const char *path); diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c new file mode 100644 index 0000000..66773e1 --- /dev/null +++ b/src/core/bpf-firewall.c @@ -0,0 +1,974 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bpf-program.h" +#include "fd-util.h" +#include "in-addr-prefix-util.h" +#include "memory-util.h" +#include "missing_syscall.h" +#include "unit.h" +#include "strv.h" +#include "virt.h" + +enum { + MAP_KEY_PACKETS, + MAP_KEY_BYTES, +}; + +enum { + ACCESS_ALLOWED = 1, + ACCESS_DENIED = 2, +}; + +/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */ + +static int add_lookup_instructions( + BPFProgram *p, + int map_fd, + int protocol, + bool is_ingress, + int verdict) { + + int r, addr_offset, addr_size; + + assert(p); + assert(map_fd >= 0); + + switch (protocol) { + + case ETH_P_IP: + addr_size = sizeof(uint32_t); + addr_offset = is_ingress ? + offsetof(struct iphdr, saddr) : + offsetof(struct iphdr, daddr); + break; + + case ETH_P_IPV6: + addr_size = 4 * sizeof(uint32_t); + addr_offset = is_ingress ? + offsetof(struct ip6_hdr, ip6_src.s6_addr) : + offsetof(struct ip6_hdr, ip6_dst.s6_addr); + break; + + default: + return -EAFNOSUPPORT; + } + + do { + /* Compare IPv4 with one word instruction (32-bit) */ + struct bpf_insn insn[] = { + /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0), + + /* + * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address + * + * R1: Pointer to the skb + * R2: Data offset + * R3: Destination buffer on the stack (r10 - 4) + * R4: Number of bytes to read (4) + */ + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV32_IMM(BPF_REG_2, addr_offset), + + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size), + + BPF_MOV32_IMM(BPF_REG_4, addr_size), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + + /* + * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the + * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key' + * has to be set to the maximum possible value. + * + * On success, the looked up value is stored in R0. For this application, the actual + * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any + * matching value. + */ + + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)), + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8), + + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict), + }; + + /* Jump label fixup */ + insn[0].off = ELEMENTSOF(insn) - 1; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + + } while (false); + + return 0; +} + +static int add_instructions_for_ip_any( + BPFProgram *p, + int verdict) { + int r; + + assert(p); + + const struct bpf_insn insn[] = { + BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict), + }; + + r = bpf_program_add_instructions(p, insn, 1); + if (r < 0) + return r; + + return 0; +} + +static int bpf_firewall_compile_bpf( + Unit *u, + const char *prog_name, + bool is_ingress, + BPFProgram **ret, + bool ip_allow_any, + bool ip_deny_any) { + + const struct bpf_insn pre_insn[] = { + /* + * When the eBPF program is entered, R1 contains the address of the skb. + * However, R1-R5 are scratch registers that are not preserved when calling + * into kernel functions, so we need to save anything that's supposed to + * stay around to R6-R9. Save the skb to R6. + */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* + * Although we cannot access the skb data directly from eBPF programs used in this + * scenario, the kernel has prepared some fields for us to access through struct __sk_buff. + * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7 + * for later use. + */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)), + + /* + * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet + * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning. + */ + BPF_MOV32_IMM(BPF_REG_8, 0), + }; + + /* + * The access checkers compiled for the configured allowance and denial lists + * write to R8 at runtime. The following code prepares for an early exit that + * skip the accounting if the packet is denied. + * + * R0 = 1 + * if (R8 == ACCESS_DENIED) + * R0 = 0 + * + * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet + * is allowed to pass. + */ + const struct bpf_insn post_insn[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + }; + + _cleanup_(bpf_program_freep) BPFProgram *p = NULL; + int accounting_map_fd, r; + bool access_enabled; + + assert(u); + assert(ret); + + accounting_map_fd = is_ingress ? + u->ip_accounting_ingress_map_fd : + u->ip_accounting_egress_map_fd; + + access_enabled = + u->ipv4_allow_map_fd >= 0 || + u->ipv6_allow_map_fd >= 0 || + u->ipv4_deny_map_fd >= 0 || + u->ipv6_deny_map_fd >= 0 || + ip_allow_any || + ip_deny_any; + + if (accounting_map_fd < 0 && !access_enabled) { + *ret = NULL; + return 0; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p); + if (r < 0) + return r; + + r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn)); + if (r < 0) + return r; + + if (access_enabled) { + /* + * The simple rule this function translates into eBPF instructions is: + * + * - Access will be granted when an address matches an entry in @list_allow + * - Otherwise, access will be denied when an address matches an entry in @list_deny + * - Otherwise, access will be granted + */ + + if (u->ipv4_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv6_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv4_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (u->ipv6_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (ip_allow_any) { + r = add_instructions_for_ip_any(p, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (ip_deny_any) { + r = add_instructions_for_ip_any(p, ACCESS_DENIED); + if (r < 0) + return r; + } + } + + r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn)); + if (r < 0) + return r; + + if (accounting_map_fd >= 0) { + struct bpf_insn insn[] = { + /* + * If R0 == 0, the packet will be denied; skip the accounting instructions in this case. + * The jump label will be fixed up later. + */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0), + + /* Count packets */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Count bytes */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Allow the packet to pass */ + BPF_MOV64_IMM(BPF_REG_0, 1), + }; + + /* Jump label fixup */ + insn[0].off = ELEMENTSOF(insn) - 1; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } + + do { + /* + * Exit from the eBPF program, R0 contains the verdict. + * 0 means the packet is denied, 1 means the packet may pass. + */ + const struct bpf_insn insn[] = { + BPF_EXIT_INSN() + }; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } while (false); + + *ret = TAKE_PTR(p); + + return 0; +} + +static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) { + struct in_addr_prefix *a; + + assert(n_ipv4); + assert(n_ipv6); + + SET_FOREACH(a, prefixes) + switch (a->family) { + + case AF_INET: + (*n_ipv4)++; + break; + + case AF_INET6: + (*n_ipv6)++; + break; + + default: + return -EAFNOSUPPORT; + } + + return 0; +} + +static int bpf_firewall_add_access_items( + Set *prefixes, + int ipv4_map_fd, + int ipv6_map_fd, + int verdict) { + + struct bpf_lpm_trie_key *key_ipv4, *key_ipv6; + struct in_addr_prefix *a; + uint64_t value = verdict; + int r; + + key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)); + key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4); + + SET_FOREACH(a, prefixes) + switch (a->family) { + + case AF_INET: + key_ipv4->prefixlen = a->prefixlen; + memcpy(key_ipv4->data, &a->address, sizeof(uint32_t)); + + r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value); + if (r < 0) + return r; + + break; + + case AF_INET6: + key_ipv6->prefixlen = a->prefixlen; + memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t)); + + r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value); + if (r < 0) + return r; + + break; + + default: + return -EAFNOSUPPORT; + } + + return 0; +} + +static int bpf_firewall_prepare_access_maps( + Unit *u, + int verdict, + int *ret_ipv4_map_fd, + int *ret_ipv6_map_fd, + bool *ret_has_any) { + + _cleanup_close_ int ipv4_map_fd = -EBADF, ipv6_map_fd = -EBADF; + size_t n_ipv4 = 0, n_ipv6 = 0; + Unit *p; + int r; + + assert(ret_ipv4_map_fd); + assert(ret_ipv6_map_fd); + assert(ret_has_any); + + for (p = u; p; p = UNIT_GET_SLICE(p)) { + CGroupContext *cc; + Set *prefixes; + bool *reduced; + + cc = unit_get_cgroup_context(p); + if (!cc) + continue; + + prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny; + reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced; + + if (!*reduced) { + r = in_addr_prefixes_reduce(prefixes); + if (r < 0) + return r; + + *reduced = true; + } + + bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6); + + /* Skip making the LPM trie map in cases where we are using "any" in order to hack around + * needing CAP_SYS_ADMIN for allocating LPM trie map. */ + if (in_addr_prefixes_is_any(prefixes)) { + *ret_has_any = true; + return 0; + } + } + + if (n_ipv4 > 0) { + char *name = strjoina("4_", u->id); + ipv4_map_fd = bpf_map_new( + name, + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t), + sizeof(uint64_t), + n_ipv4, + BPF_F_NO_PREALLOC); + if (ipv4_map_fd < 0) + return ipv4_map_fd; + } + + if (n_ipv6 > 0) { + char *name = strjoina("6_", u->id); + ipv6_map_fd = bpf_map_new( + name, + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4, + sizeof(uint64_t), + n_ipv6, + BPF_F_NO_PREALLOC); + if (ipv6_map_fd < 0) + return ipv6_map_fd; + } + + for (p = u; p; p = UNIT_GET_SLICE(p)) { + CGroupContext *cc; + + cc = unit_get_cgroup_context(p); + if (!cc) + continue; + + r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, + ipv4_map_fd, ipv6_map_fd, verdict); + if (r < 0) + return r; + } + + *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd); + *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd); + *ret_has_any = false; + return 0; +} + +static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) { + int r; + + assert(u); + assert(fd_ingress); + assert(fd_egress); + + if (enabled) { + if (*fd_ingress < 0) { + char *name = strjoina("I_", u->id); + r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_ingress = r; + } + + if (*fd_egress < 0) { + char *name = strjoina("E_", u->id); + r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_egress = r; + } + + } else { + *fd_ingress = safe_close(*fd_ingress); + *fd_egress = safe_close(*fd_egress); + + zero(u->ip_accounting_extra); + } + + return 0; +} + +int bpf_firewall_compile(Unit *u) { + const char *ingress_name = NULL, *egress_name = NULL; + bool ip_allow_any = false, ip_deny_any = false; + CGroupContext *cc; + int r, supported; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + if (supported == BPF_FIREWALL_UNSUPPORTED) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF firewalling not supported, proceeding without."); + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) + /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice + * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption + * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is + * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at + * all, either. */ + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF_F_ALLOW_MULTI is not supported, not doing BPF firewall on slice units."); + + /* If BPF_F_ALLOW_MULTI flag is supported program name is also supported (both were added to v4.15 + * kernel). */ + if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) { + ingress_name = "sd_fw_ingress"; + egress_name = "sd_fw_egress"; + } + + /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves, + * but we reuse the accounting maps. That way the firewall in effect always maps to the actual + * configuration, but we don't flush out the accounting unnecessarily */ + + u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress); + u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress); + + u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); + u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd); + + u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd); + u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd); + + if (u->type != UNIT_SLICE) { + /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf + * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that + * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this + * means that all configure IP access rules *will* take effect on processes, even though we never + * compile them for inner nodes. */ + + r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m"); + + r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m"); + } + + r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m"); + + r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m"); + + r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m"); + + return 0; +} + +static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) { + set_clear(*set); + + STRV_FOREACH(bpf_fs_path, filter_paths) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + int r; + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m"); + + r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Loading of ingress BPF program %s failed: %m", *bpf_fs_path); + + r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog)); + if (r < 0) + return log_oom(); + } + + return 0; +} + +int bpf_firewall_load_custom(Unit *u) { + CGroupContext *cc; + int r, supported; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + if (!(cc->ip_filters_ingress || cc->ip_filters_egress)) + return 0; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs."); + + r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress); + if (r < 0) + return r; + r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress); + if (r < 0) + return r; + + return 0; +} + +static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) { + BPFProgram *prog; + int r; + + assert(u); + + set_clear(*set_installed); + r = set_ensure_allocated(set_installed, &bpf_program_hash_ops); + if (r < 0) + return log_oom(); + + SET_FOREACH_MOVE(prog, *set_installed, *set) { + r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom egress BPF program to cgroup %s failed: %m", path); + } + return 0; +} + +int bpf_firewall_install(Unit *u) { + _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL; + _cleanup_free_ char *path = NULL; + CGroupContext *cc; + int r, supported; + uint32_t flags; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + if (!u->cgroup_path) + return -EINVAL; + if (!u->cgroup_realized) + return -EINVAL; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + if (supported == BPF_FIREWALL_UNSUPPORTED) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF firewalling not supported, proceeding without."); + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units."); + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && + (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress))) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs."); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m"); + + flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0; + + if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) { + /* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only + * after attaching the new programs, so that there's no time window where neither program is + * attached. (There will be a program where both are attached, but that's OK, since this is a + * security feature where we rather want to lock down too much than too little */ + ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed); + ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed); + } else { + /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly + * detach them) right before attaching the new program, to minimize the time window when we + * don't account for IP traffic. */ + u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed); + u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed); + } + + if (u->ip_bpf_egress) { + r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags); + if (r < 0) + return log_unit_error_errno(u, r, + "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path); + + /* Remember that this BPF program is installed now. */ + u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress); + } + + if (u->ip_bpf_ingress) { + r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags); + if (r < 0) + return log_unit_error_errno(u, r, + "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path); + + u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress); + } + + /* And now, definitely get rid of the old programs, and detach them */ + ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall); + ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall); + + r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed); + if (r < 0) + return r; + + r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed); + if (r < 0) + return r; + + return 0; +} + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) { + uint64_t key, packets; + int r; + + if (map_fd < 0) + return -EBADF; + + if (ret_packets) { + key = MAP_KEY_PACKETS; + r = bpf_map_lookup_element(map_fd, &key, &packets); + if (r < 0) + return r; + } + + if (ret_bytes) { + key = MAP_KEY_BYTES; + r = bpf_map_lookup_element(map_fd, &key, ret_bytes); + if (r < 0) + return r; + } + + if (ret_packets) + *ret_packets = packets; + + return 0; +} + +int bpf_firewall_reset_accounting(int map_fd) { + uint64_t key, value = 0; + int r; + + if (map_fd < 0) + return -EBADF; + + key = MAP_KEY_PACKETS; + r = bpf_map_update_element(map_fd, &key, &value); + if (r < 0) + return r; + + key = MAP_KEY_BYTES; + return bpf_map_update_element(map_fd, &key, &value); +} + +static int bpf_firewall_unsupported_reason = 0; + +int bpf_firewall_supported(void) { + const struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_(bpf_program_freep) BPFProgram *program = NULL; + static int supported = -1; + union bpf_attr attr; + int r; + + /* Checks whether BPF firewalling is supported. For this, we check the following things: + * + * - whether the unified hierarchy is being used + * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require + * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require + */ + if (supported >= 0) + return supported; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "bpf-firewall: Can't determine whether the unified hierarchy is used: %m"); + if (r == 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN), + "bpf-firewall: Not running with unified cgroup hierarchy, BPF firewalling is not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* prog_name is NULL since it is supported only starting from v4.15 kernel. */ + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program); + if (r < 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(r, "bpf-firewall: Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial)); + if (r < 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(r, "bpf-firewall: Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = bpf_program_load_kernel(program, NULL, 0); + if (r < 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(r, "bpf-firewall: Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF + * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF + * program if we can't do a thing with it later? + * + * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if + * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the + * parameters are validated however, and that'll fail with EBADF then. */ + + // FIXME: Clang doesn't 0-pad with structured initialization, causing + // the kernel to reject the bpf_attr as invalid. See: + // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65 + // Ideally it should behave like GCC, so that we can remove these workarounds. + zero(attr); + attr.attach_type = BPF_CGROUP_INET_EGRESS; + attr.target_fd = -EBADF; + attr.attach_bpf_fd = -EBADF; + + if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) { + if (errno != EBADF) { + bpf_firewall_unsupported_reason = + log_debug_errno(errno, "bpf-firewall: Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* YAY! */ + } else { + bpf_firewall_unsupported_reason = + log_debug_errno(SYNTHETIC_ERRNO(EBADE), + "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_DETACH call? " + "Something is weird, assuming BPF firewalling is broken and hence not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported + * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH + * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll + * get EINVAL if it's not supported, and EBADF as before if it is available. + * Use probe result as the indicator that program name is also supported since they both were + * added in kernel 4.15. */ + + zero(attr); + attr.attach_type = BPF_CGROUP_INET_EGRESS; + attr.target_fd = -EBADF; + attr.attach_bpf_fd = -EBADF; + attr.attach_flags = BPF_F_ALLOW_MULTI; + + if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) { + if (errno == EBADF) { + log_debug_errno(errno, "bpf-firewall: Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!"); + return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI; + } + + if (errno == EINVAL) + log_debug_errno(errno, "bpf-firewall: Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported."); + else + log_debug_errno(errno, "bpf-firewall: Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m"); + + return supported = BPF_FIREWALL_SUPPORTED; + } else { + bpf_firewall_unsupported_reason = + log_debug_errno(SYNTHETIC_ERRNO(EBADE), + "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? " + "Something is weird, assuming BPF firewalling is broken and hence not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } +} + +void emit_bpf_firewall_warning(Unit *u) { + static bool warned = false; + + assert(u); + assert(u->manager); + + if (warned || MANAGER_IS_TEST_RUN(u->manager)) + return; + + bool quiet = ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0; + + log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason, + "unit configures an IP firewall, but %s.\n" + "(This warning is only shown for the first unit using IP firewalling.)", + getuid() != 0 ? "not running as root" : + "the local system does not support BPF/cgroup firewalling"); + warned = true; +} + +void bpf_firewall_close(Unit *u) { + assert(u); + + u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd); + u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd); + + u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); + u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd); + u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd); + u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd); + + u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress); + u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed); + u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress); + u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed); + + u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress); + u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress); + u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed); + u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed); +} diff --git a/src/core/bpf-firewall.h b/src/core/bpf-firewall.h new file mode 100644 index 0000000..58b401f --- /dev/null +++ b/src/core/bpf-firewall.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "unit.h" + +enum { + BPF_FIREWALL_UNSUPPORTED = 0, + BPF_FIREWALL_SUPPORTED = 1, + BPF_FIREWALL_SUPPORTED_WITH_MULTI = 2, +}; + +int bpf_firewall_supported(void); + +int bpf_firewall_compile(Unit *u); +int bpf_firewall_install(Unit *u); +int bpf_firewall_load_custom(Unit *u); + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets); +int bpf_firewall_reset_accounting(int map_fd); + +void emit_bpf_firewall_warning(Unit *u); + +void bpf_firewall_close(Unit *u); diff --git a/src/core/bpf-foreign.c b/src/core/bpf-foreign.c new file mode 100644 index 0000000..cff2f61 --- /dev/null +++ b/src/core/bpf-foreign.c @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bpf-foreign.h" +#include "bpf-program.h" +#include "cgroup.h" +#include "memory-util.h" +#include "missing_magic.h" +#include "mountpoint-util.h" +#include "set.h" +#include "stat-util.h" + +typedef struct BPFForeignKey BPFForeignKey; +struct BPFForeignKey { + uint32_t prog_id; + uint32_t attach_type; +}; + +static int bpf_foreign_key_new(uint32_t prog_id, + enum bpf_attach_type attach_type, + BPFForeignKey **ret) { + _cleanup_free_ BPFForeignKey *p = NULL; + + assert(ret); + + p = new(BPFForeignKey, 1); + if (!p) + return -ENOMEM; + + *p = (BPFForeignKey) { + .prog_id = prog_id, + .attach_type = attach_type, + }; + + *ret = TAKE_PTR(p); + + return 0; +} + +static int bpf_foreign_key_compare_func(const BPFForeignKey *a, const BPFForeignKey *b) { + int r = CMP(a->prog_id, b->prog_id); + if (r != 0) + return r; + + return CMP(a->attach_type, b->attach_type); +} + +static void bpf_foreign_key_hash_func(const BPFForeignKey *p, struct siphash *h) { + siphash24_compress(&p->prog_id, sizeof(p->prog_id), h); + siphash24_compress(&p->attach_type, sizeof(p->attach_type), h); +} + +DEFINE_PRIVATE_HASH_OPS_FULL(bpf_foreign_by_key_hash_ops, + BPFForeignKey, bpf_foreign_key_hash_func, bpf_foreign_key_compare_func, free, + BPFProgram, bpf_program_free); + +static int attach_programs(Unit *u, const char *path, Hashmap* foreign_by_key, uint32_t attach_flags) { + const BPFForeignKey *key; + BPFProgram *prog; + int r, ret = 0; + + assert(u); + + HASHMAP_FOREACH_KEY(prog, key, foreign_by_key) { + r = bpf_program_cgroup_attach(prog, key->attach_type, path, attach_flags); + if (r < 0) { + log_unit_error_errno(u, r, "bpf-foreign: Attaching foreign BPF program to cgroup %s failed: %m", path); + if (ret >= 0) + ret = r; + } + } + + return ret; +} + +/* + * Prepare foreign BPF program for installation: + * - Load the program from BPF filesystem to the kernel; + * - Store program FD identified by program ID and attach type in the unit. + */ +static int bpf_foreign_prepare( + Unit *u, + enum bpf_attach_type attach_type, + const char *bpffs_path) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + _cleanup_free_ BPFForeignKey *key = NULL; + uint32_t prog_id; + int r; + + assert(u); + assert(bpffs_path); + + r = path_is_fs_type(bpffs_path, BPF_FS_MAGIC); + if (r == -ENOENT) { + log_unit_warning_errno(u, r, "bpf-foreign: foreign program %s does not exist, skipping.", bpffs_path); + return 0; + } + if (r < 0) + return log_unit_error_errno(u, r, + "bpf-foreign: Failed to determine filesystem type of %s: %m", bpffs_path); + if (r == 0) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "bpf-foreign: Path in BPF filesystem is expected."); + + r = bpf_program_new_from_bpffs_path(bpffs_path, &prog); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-foreign: Failed to create foreign BPF program: %m"); + + r = bpf_program_get_id_by_fd(prog->kernel_fd, &prog_id); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-foreign: Failed to get BPF program id from fd: %m"); + + r = bpf_foreign_key_new(prog_id, attach_type, &key); + if (r < 0) + return log_unit_error_errno(u, r, + "bpf-foreign: Failed to create foreign BPF program key from path '%s': %m", bpffs_path); + + r = hashmap_ensure_put(&u->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog); + if (r == -EEXIST) { + log_unit_warning_errno(u, r, "bpf-foreign: Foreign BPF program already exists, ignoring: %m"); + return 0; + } + if (r < 0) + return log_unit_error_errno(u, r, "bpf-foreign: Failed to put foreign BPF program into map: %m"); + + TAKE_PTR(key); + TAKE_PTR(prog); + + return 0; +} + +int bpf_foreign_install(Unit *u) { + _cleanup_free_ char *cgroup_path = NULL; + CGroupContext *cc; + int r, ret = 0; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-foreign: Failed to get cgroup path: %m"); + + LIST_FOREACH(programs, p, cc->bpf_foreign_programs) { + r = bpf_foreign_prepare(u, p->attach_type, p->bpffs_path); + if (r < 0 && ret >= 0) + ret = r; + } + + r = attach_programs(u, cgroup_path, u->bpf_foreign_by_key, BPF_F_ALLOW_MULTI); + return ret < 0 ? ret : r; +} diff --git a/src/core/bpf-foreign.h b/src/core/bpf-foreign.h new file mode 100644 index 0000000..e387b1b --- /dev/null +++ b/src/core/bpf-foreign.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include "unit.h" + +static inline int bpf_foreign_supported(void) { + return cg_all_unified(); +} + +/* + * Attach cgroup-bpf programs foreign to systemd, i.e. loaded to the kernel by an entity + * external to systemd. + */ +int bpf_foreign_install(Unit *u); diff --git a/src/core/bpf-lsm.c b/src/core/bpf-lsm.c new file mode 100644 index 0000000..216fc34 --- /dev/null +++ b/src/core/bpf-lsm.c @@ -0,0 +1,320 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bpf-lsm.h" +#include "cgroup-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "filesystems.h" +#include "log.h" +#include "lsm-util.h" +#include "manager.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "stat-util.h" +#include "strv.h" + +#if BPF_FRAMEWORK +/* libbpf, clang and llc compile time dependencies are satisfied */ +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "bpf-util.h" +#include "bpf/restrict_fs/restrict-fs-skel.h" + +#define CGROUP_HASH_SIZE_MAX 2048 + +static struct restrict_fs_bpf *restrict_fs_bpf_free(struct restrict_fs_bpf *obj) { + /* restrict_fs_bpf__destroy handles object == NULL case */ + (void) restrict_fs_bpf__destroy(obj); + + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free); + +static bool bpf_can_link_lsm_program(struct bpf_program *prog) { + _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; + + assert(prog); + + link = sym_bpf_program__attach_lsm(prog); + + /* If bpf_program__attach_lsm fails the resulting value stores libbpf error code instead of memory + * pointer. That is the case when the helper is called on architectures where BPF trampoline (hence + * BPF_LSM_MAC attach type) is not supported. */ + return sym_libbpf_get_error(link) == 0; +} + +static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) { + _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; + _cleanup_close_ int inner_map_fd = -EBADF; + int r; + + assert(ret_obj); + + obj = restrict_fs_bpf__open(); + if (!obj) + return log_error_errno(errno, "bpf-lsm: Failed to open BPF object: %m"); + + /* TODO Maybe choose a number based on runtime information? */ + r = sym_bpf_map__set_max_entries(obj->maps.cgroup_hash, CGROUP_HASH_SIZE_MAX); + assert(r <= 0); + if (r < 0) + return log_error_errno(r, "bpf-lsm: Failed to resize BPF map '%s': %m", + sym_bpf_map__name(obj->maps.cgroup_hash)); + + /* Dummy map to satisfy the verifier */ + inner_map_fd = compat_bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), sizeof(uint32_t), 128U, NULL); + if (inner_map_fd < 0) + return log_error_errno(errno, "bpf-lsm: Failed to create BPF map: %m"); + + r = sym_bpf_map__set_inner_map_fd(obj->maps.cgroup_hash, inner_map_fd); + assert(r <= 0); + if (r < 0) + return log_error_errno(r, "bpf-lsm: Failed to set inner map fd: %m"); + + r = restrict_fs_bpf__load(obj); + assert(r <= 0); + if (r < 0) + return log_error_errno(r, "bpf-lsm: Failed to load BPF object: %m"); + + *ret_obj = TAKE_PTR(obj); + + return 0; +} + +bool lsm_bpf_supported(bool initialize) { + _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; + static int supported = -1; + int r; + + if (supported >= 0) + return supported; + if (!initialize) + return false; + + if (!cgroup_bpf_supported()) + return (supported = false); + + r = lsm_supported("bpf"); + if (r < 0) { + log_warning_errno(r, "bpf-lsm: Can't determine whether the BPF LSM module is used: %m"); + return (supported = false); + } + if (r == 0) { + log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-lsm: BPF LSM hook not enabled in the kernel, BPF LSM not supported"); + return (supported = false); + } + + r = prepare_restrict_fs_bpf(&obj); + if (r < 0) + return (supported = false); + + if (!bpf_can_link_lsm_program(obj->progs.restrict_filesystems)) { + log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-lsm: Failed to link program; assuming BPF LSM is not available"); + return (supported = false); + } + + return (supported = true); +} + +int lsm_bpf_setup(Manager *m) { + _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; + _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; + int r; + + assert(m); + + r = prepare_restrict_fs_bpf(&obj); + if (r < 0) + return r; + + link = sym_bpf_program__attach_lsm(obj->progs.restrict_filesystems); + r = sym_libbpf_get_error(link); + if (r != 0) + return log_error_errno(r, "bpf-lsm: Failed to link '%s' LSM BPF program: %m", + sym_bpf_program__name(obj->progs.restrict_filesystems)); + + log_info("bpf-lsm: LSM BPF program attached"); + + obj->links.restrict_filesystems = TAKE_PTR(link); + m->restrict_fs = TAKE_PTR(obj); + + return 0; +} + +int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list) { + uint32_t dummy_value = 1, zero = 0; + const char *fs; + const statfs_f_type_t *magic; + int r; + + assert(filesystems); + assert(outer_map_fd >= 0); + + int inner_map_fd = compat_bpf_map_create( + BPF_MAP_TYPE_HASH, + NULL, + sizeof(uint32_t), + sizeof(uint32_t), + 128U, /* Should be enough for all filesystem types */ + NULL); + if (inner_map_fd < 0) + return log_error_errno(errno, "bpf-lsm: Failed to create inner BPF map: %m"); + + if (sym_bpf_map_update_elem(outer_map_fd, &cgroup_id, &inner_map_fd, BPF_ANY) != 0) + return log_error_errno(errno, "bpf-lsm: Error populating BPF map: %m"); + + uint32_t allow = allow_list; + + /* Use key 0 to store whether this is an allow list or a deny list */ + if (sym_bpf_map_update_elem(inner_map_fd, &zero, &allow, BPF_ANY) != 0) + return log_error_errno(errno, "bpf-lsm: Error initializing map: %m"); + + SET_FOREACH(fs, filesystems) { + r = fs_type_from_string(fs, &magic); + if (r < 0) { + log_warning("bpf-lsm: Invalid filesystem name '%s', ignoring.", fs); + continue; + } + + log_debug("bpf-lsm: Restricting filesystem access to '%s'", fs); + + for (int i = 0; i < FILESYSTEM_MAGIC_MAX; i++) { + if (magic[i] == 0) + break; + + if (sym_bpf_map_update_elem(inner_map_fd, &magic[i], &dummy_value, BPF_ANY) != 0) { + r = log_error_errno(errno, "bpf-lsm: Failed to update BPF map: %m"); + + if (sym_bpf_map_delete_elem(outer_map_fd, &cgroup_id) != 0) + log_debug_errno(errno, "bpf-lsm: Failed to delete cgroup entry from BPF map: %m"); + + return r; + } + } + } + + return 0; +} + +int lsm_bpf_cleanup(const Unit *u) { + assert(u); + assert(u->manager); + + /* If we never successfully detected support, there is nothing to clean up. */ + if (!lsm_bpf_supported(/* initialize = */ false)) + return 0; + + if (!u->manager->restrict_fs) + return 0; + + if (u->cgroup_id == 0) + return 0; + + int fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash); + if (fd < 0) + return log_unit_error_errno(u, errno, "bpf-lsm: Failed to get BPF map fd: %m"); + + if (sym_bpf_map_delete_elem(fd, &u->cgroup_id) != 0 && errno != ENOENT) + return log_unit_debug_errno(u, errno, "bpf-lsm: Failed to delete cgroup entry from LSM BPF map: %m"); + + return 0; +} + +int lsm_bpf_map_restrict_fs_fd(Unit *unit) { + assert(unit); + assert(unit->manager); + + if (!unit->manager->restrict_fs) + return -ENOMEDIUM; + + return sym_bpf_map__fd(unit->manager->restrict_fs->maps.cgroup_hash); +} + +void lsm_bpf_destroy(struct restrict_fs_bpf *prog) { + restrict_fs_bpf__destroy(prog); +} +#else /* ! BPF_FRAMEWORK */ +bool lsm_bpf_supported(bool initialize) { + return false; +} + +int lsm_bpf_setup(Manager *m) { + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to set up LSM BPF: %m"); +} + +int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, const bool allow_list) { + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to restrict filesystems using LSM BPF: %m"); +} + +int lsm_bpf_cleanup(const Unit *u) { + return 0; +} + +int lsm_bpf_map_restrict_fs_fd(Unit *unit) { + return -ENOMEDIUM; +} + +void lsm_bpf_destroy(struct restrict_fs_bpf *prog) { + return; +} +#endif + +int lsm_bpf_parse_filesystem( + const char *name, + Set **filesystems, + FilesystemParseFlags flags, + const char *unit, + const char *filename, + unsigned line) { + int r; + + assert(name); + assert(filesystems); + + if (name[0] == '@') { + const FilesystemSet *set; + + set = filesystem_set_find(name); + if (!set) { + log_syntax(unit, flags & FILESYSTEM_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0, + "bpf-lsm: Unknown filesystem group, ignoring: %s", name); + return 0; + } + + NULSTR_FOREACH(i, set->value) { + /* Call ourselves again, for the group to parse. Note that we downgrade logging here + * (i.e. take away the FILESYSTEM_PARSE_LOG flag) since any issues in the group table + * are our own problem, not a problem in user configuration data and we shouldn't + * pretend otherwise by complaining about them. */ + r = lsm_bpf_parse_filesystem(i, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line); + if (r < 0) + return r; + } + } else { + /* If we previously wanted to forbid access to a filesystem and now + * we want to allow it, then remove it from the list. */ + if (!(flags & FILESYSTEM_PARSE_INVERT) == !!(flags & FILESYSTEM_PARSE_ALLOW_LIST)) { + r = set_put_strdup(filesystems, name); + if (r == -ENOMEM) + return flags & FILESYSTEM_PARSE_LOG ? log_oom() : -ENOMEM; + if (r < 0 && r != -EEXIST) /* When already in set, ignore */ + return r; + } else + free(set_remove(*filesystems, name)); + } + + return 0; +} diff --git a/src/core/bpf-lsm.h b/src/core/bpf-lsm.h new file mode 100644 index 0000000..a6eda19 --- /dev/null +++ b/src/core/bpf-lsm.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "hashmap.h" + +typedef enum FilesystemParseFlags { + FILESYSTEM_PARSE_INVERT = 1 << 0, + FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1, + FILESYSTEM_PARSE_LOG = 1 << 2, +} FilesystemParseFlags; + +typedef struct Unit Unit; +typedef struct Manager Manager; + +typedef struct restrict_fs_bpf restrict_fs_bpf; + +bool lsm_bpf_supported(bool initialize); +int lsm_bpf_setup(Manager *m); +int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list); +int lsm_bpf_cleanup(const Unit *u); +int lsm_bpf_map_restrict_fs_fd(Unit *u); +void lsm_bpf_destroy(struct restrict_fs_bpf *prog); +int lsm_bpf_parse_filesystem(const char *name, + Set **filesystems, + FilesystemParseFlags flags, + const char *unit, + const char *filename, + unsigned line); diff --git a/src/core/bpf-socket-bind.c b/src/core/bpf-socket-bind.c new file mode 100644 index 0000000..9f290ab --- /dev/null +++ b/src/core/bpf-socket-bind.c @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if BPF_FRAMEWORK +#include +#endif + +#include "fd-util.h" +#include "bpf-socket-bind.h" + +#if BPF_FRAMEWORK +/* libbpf, clang, llvm and bpftool compile time dependencies are satisfied */ +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "bpf-util.h" +#include "bpf/socket_bind/socket-bind-api.bpf.h" +#include "bpf/socket_bind/socket-bind-skel.h" + +static struct socket_bind_bpf *socket_bind_bpf_free(struct socket_bind_bpf *obj) { + /* socket_bind_bpf__destroy handles object == NULL case */ + (void) socket_bind_bpf__destroy(obj); + + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct socket_bind_bpf *, socket_bind_bpf_free); + +static int update_rules_map( + int map_fd, + CGroupSocketBindItem *head) { + + uint32_t i = 0; + + assert(map_fd >= 0); + + LIST_FOREACH(socket_bind_items, item, head) { + struct socket_bind_rule val = { + .address_family = (uint32_t) item->address_family, + .protocol = item->ip_protocol, + .nr_ports = item->nr_ports, + .port_min = item->port_min, + }; + + uint32_t key = i++; + + if (sym_bpf_map_update_elem(map_fd, &key, &val, BPF_ANY) != 0) + return -errno; + } + + return 0; +} + +static int prepare_socket_bind_bpf( + Unit *u, + CGroupSocketBindItem *allow, + CGroupSocketBindItem *deny, + struct socket_bind_bpf **ret_obj) { + + _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL; + size_t allow_count = 0, deny_count = 0; + int allow_map_fd, deny_map_fd, r; + + assert(ret_obj); + + LIST_FOREACH(socket_bind_items, item, allow) + allow_count++; + + LIST_FOREACH(socket_bind_items, item, deny) + deny_count++; + + if (allow_count > SOCKET_BIND_MAX_RULES) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL), + "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES); + + if (deny_count > SOCKET_BIND_MAX_RULES) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL), + "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES); + + obj = socket_bind_bpf__open(); + if (!obj) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "bpf-socket-bind: Failed to open BPF object: %m"); + + if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_allow, MAX(allow_count, 1u)) != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno, + "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_allow)); + + if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_deny, MAX(deny_count, 1u)) != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno, + "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_deny)); + + if (socket_bind_bpf__load(obj) != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, + "bpf-socket-bind: Failed to load BPF object: %m"); + + allow_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_allow); + assert(allow_map_fd >= 0); + + r = update_rules_map(allow_map_fd, allow); + if (r < 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r, + "bpf-socket-bind: Failed to put socket bind allow rules into BPF map '%s'", + sym_bpf_map__name(obj->maps.sd_bind_allow)); + + deny_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_deny); + assert(deny_map_fd >= 0); + + r = update_rules_map(deny_map_fd, deny); + if (r < 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r, + "bpf-socket-bind: Failed to put socket bind deny rules into BPF map '%s'", + sym_bpf_map__name(obj->maps.sd_bind_deny)); + + *ret_obj = TAKE_PTR(obj); + return 0; +} + +int bpf_socket_bind_supported(void) { + _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL; + int r; + + if (!cgroup_bpf_supported()) + return false; + + if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, /*opts=*/NULL)) { + log_debug("bpf-socket-bind: BPF program type cgroup_sock_addr is not supported"); + return false; + } + + r = prepare_socket_bind_bpf(/*unit=*/NULL, /*allow_rules=*/NULL, /*deny_rules=*/NULL, &obj); + if (r < 0) { + log_debug_errno(r, "bpf-socket-bind: socket bind filtering is not supported: %m"); + return false; + } + + return bpf_can_link_program(obj->progs.sd_bind4); +} + +int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) { + int r; + + assert(u); + + if (!u->initial_socket_bind_link_fds) { + u->initial_socket_bind_link_fds = fdset_new(); + if (!u->initial_socket_bind_link_fds) + return log_oom(); + } + + r = fdset_put(u->initial_socket_bind_link_fds, fd); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to put BPF fd %d to initial fdset", fd); + + return 0; +} + +static int socket_bind_install_impl(Unit *u) { + _cleanup_(bpf_link_freep) struct bpf_link *ipv4 = NULL, *ipv6 = NULL; + _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL; + _cleanup_free_ char *cgroup_path = NULL; + _cleanup_close_ int cgroup_fd = -EBADF; + CGroupContext *cc; + int r; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to get cgroup path: %m"); + + if (!cc->socket_bind_allow && !cc->socket_bind_deny) + return 0; + + r = prepare_socket_bind_bpf(u, cc->socket_bind_allow, cc->socket_bind_deny, &obj); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to load BPF object: %m"); + + cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC, 0); + if (cgroup_fd < 0) + return log_unit_error_errno(u, errno, "bpf-socket-bind: Failed to open cgroup %s for reading: %m", cgroup_path); + + ipv4 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind4, cgroup_fd); + r = sym_libbpf_get_error(ipv4); + if (r != 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m", + sym_bpf_program__name(obj->progs.sd_bind4)); + + ipv6 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind6, cgroup_fd); + r = sym_libbpf_get_error(ipv6); + if (r != 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m", + sym_bpf_program__name(obj->progs.sd_bind6)); + + u->ipv4_socket_bind_link = TAKE_PTR(ipv4); + u->ipv6_socket_bind_link = TAKE_PTR(ipv6); + + return 0; +} + +int bpf_socket_bind_install(Unit *u) { + int r; + + assert(u); + + r = socket_bind_install_impl(u); + if (r == -ENOMEM) + return r; + + fdset_close(u->initial_socket_bind_link_fds); + return r; +} + +int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) { + int r; + + assert(u); + + r = bpf_serialize_link(f, fds, "ipv4-socket-bind-bpf-link", u->ipv4_socket_bind_link); + if (r < 0) + return r; + + return bpf_serialize_link(f, fds, "ipv6-socket-bind-bpf-link", u->ipv6_socket_bind_link); +} + +#else /* ! BPF_FRAMEWORK */ +int bpf_socket_bind_supported(void) { + return false; +} + +int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) { + return 0; +} + +int bpf_socket_bind_install(Unit *u) { + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-socket-bind: Failed to install; BPF framework is not supported"); +} + +int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) { + return 0; +} +#endif diff --git a/src/core/bpf-socket-bind.h b/src/core/bpf-socket-bind.h new file mode 100644 index 0000000..7d426df --- /dev/null +++ b/src/core/bpf-socket-bind.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "fdset.h" +#include "unit.h" + +int bpf_socket_bind_supported(void); + +/* Add BPF link fd created before daemon-reload or daemon-reexec. FDs will be closed at the end of + * socket_bind_install. */ +int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd); + +int bpf_socket_bind_install(Unit *u); + +int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds); diff --git a/src/core/bpf-util.c b/src/core/bpf-util.c new file mode 100644 index 0000000..6fe229e --- /dev/null +++ b/src/core/bpf-util.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bpf-dlopen.h" +#include "bpf-util.h" +#include "cgroup-util.h" +#include "initrd-util.h" +#include "log.h" + +bool cgroup_bpf_supported(void) { + static int supported = -1; + int r; + + if (supported >= 0) + return supported; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) { + log_warning_errno(r, "Can't determine whether the unified hierarchy is used: %m"); + return (supported = false); + } + + if (r == 0) { + log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Not running with unified cgroup hierarchy, disabling cgroup BPF features."); + return (supported = false); + } + + r = dlopen_bpf(); + if (r < 0) { + log_full_errno(in_initrd() ? LOG_DEBUG : LOG_INFO, + r, "Failed to open libbpf, cgroup BPF features disabled: %m"); + return (supported = false); + } + + return (supported = true); +} diff --git a/src/core/bpf-util.h b/src/core/bpf-util.h new file mode 100644 index 0000000..a6c55cd --- /dev/null +++ b/src/core/bpf-util.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +bool cgroup_bpf_supported(void); diff --git a/src/core/bpf/restrict_fs/meson.build b/src/core/bpf/restrict_fs/meson.build new file mode 100644 index 0000000..69cde02 --- /dev/null +++ b/src/core/bpf/restrict_fs/meson.build @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if conf.get('BPF_FRAMEWORK') != 1 + subdir_done() +endif + +restrict_fs_bpf_o_unstripped = custom_target( + 'restrict-fs.bpf.unstripped.o', + input : 'restrict-fs.bpf.c', + output : 'restrict-fs.bpf.unstripped.o', + command : bpf_o_unstripped_cmd) + +restrict_fs_bpf_o = custom_target( + 'restrict-fs.bpf.o', + input : restrict_fs_bpf_o_unstripped, + output : 'restrict-fs.bpf.o', + command : bpf_o_cmd) + +restrict_fs_skel_h = custom_target( + 'restrict-fs.skel.h', + input : restrict_fs_bpf_o, + output : 'restrict-fs.skel.h', + command : skel_h_cmd, + capture : true) diff --git a/src/core/bpf/restrict_fs/restrict-fs-skel.h b/src/core/bpf/restrict_fs/restrict-fs-skel.h new file mode 100644 index 0000000..412cf62 --- /dev/null +++ b/src/core/bpf/restrict_fs/restrict-fs-skel.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +/* libbpf is used via dlopen(), so rename symbols */ +#define bpf_object__open_skeleton sym_bpf_object__open_skeleton +#define bpf_object__load_skeleton sym_bpf_object__load_skeleton +#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton + +#include "bpf/restrict_fs/restrict-fs.skel.h" diff --git a/src/core/bpf/restrict_fs/restrict-fs.bpf.c b/src/core/bpf/restrict_fs/restrict-fs.bpf.c new file mode 100644 index 0000000..eb5ed3e --- /dev/null +++ b/src/core/bpf/restrict_fs/restrict-fs.bpf.c @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct super_block { + unsigned long int s_magic; +} __attribute__((preserve_access_index)); + +struct inode { + struct super_block *i_sb; +} __attribute__((preserve_access_index)); + +struct file { + struct inode *f_inode; +} __attribute__((preserve_access_index)); + +/* + * max_entries is set from user space with the bpf_map__set_max_entries helper. + * */ +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __type(key, uint64_t); /* cgroup ID */ + __type(value, uint32_t); /* fs magic set */ +} cgroup_hash SEC(".maps"); + +SEC("lsm/file_open") +int BPF_PROG(restrict_filesystems, struct file *file, int ret) +{ + unsigned long raw_magic_number; + uint64_t cgroup_id; + uint32_t *value, *magic_map, magic_number, zero = 0, *is_allow; + + /* ret is the return value from the previous BPF program or 0 if it's + * the first hook */ + if (ret != 0) + return ret; + + BPF_CORE_READ_INTO(&raw_magic_number, file, f_inode, i_sb, s_magic); + /* super_block.s_magic is unsigned long, but magic_map keys are + * uint32_t. Using s_magic as-is would fail on big-endian systems, + * which have 64-bit unsigned long. So cast it. */ + magic_number = (uint32_t)raw_magic_number; + + cgroup_id = bpf_get_current_cgroup_id(); + + magic_map = bpf_map_lookup_elem(&cgroup_hash, &cgroup_id); + if (!magic_map) + return 0; + + is_allow = bpf_map_lookup_elem(magic_map, &zero); + if (!is_allow) + /* Malformed map, it doesn't include whether it's an allow list + * or a deny list. Allow. */ + return 0; + + if (*is_allow) { + /* Allow-list: Allow access only if magic_number present in inner map */ + if (!bpf_map_lookup_elem(magic_map, &magic_number)) + return -EPERM; + } else { + /* Deny-list: Allow access only if magic_number is not present in inner map */ + if (bpf_map_lookup_elem(magic_map, &magic_number)) + return -EPERM; + } + + return 0; +} + +static const char _license[] SEC("license") = "GPL"; diff --git a/src/core/bpf/restrict_ifaces/meson.build b/src/core/bpf/restrict_ifaces/meson.build new file mode 100644 index 0000000..5f36178 --- /dev/null +++ b/src/core/bpf/restrict_ifaces/meson.build @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if conf.get('BPF_FRAMEWORK') != 1 + subdir_done() +endif + +restrict_ifaces_bpf_o_unstripped = custom_target( + 'restrict-ifaces.bpf.unstripped.o', + input : 'restrict-ifaces.bpf.c', + output : 'restrict-ifaces.bpf.unstripped.o', + command : bpf_o_unstripped_cmd) + +restrict_ifaces_bpf_o = custom_target( + 'restrict-ifaces.bpf.o', + input : restrict_ifaces_bpf_o_unstripped, + output : 'restrict-ifaces.bpf.o', + command : bpf_o_cmd) + +restrict_ifaces_skel_h = custom_target( + 'restrict-ifaces.skel.h', + input : restrict_ifaces_bpf_o, + output : 'restrict-ifaces.skel.h', + command : skel_h_cmd, + capture : true) diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h new file mode 100644 index 0000000..f937490 --- /dev/null +++ b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +/* libbpf is used via dlopen(), so rename symbols */ +#define bpf_object__open_skeleton sym_bpf_object__open_skeleton +#define bpf_object__load_skeleton sym_bpf_object__load_skeleton +#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton + +#include "bpf/restrict_ifaces/restrict-ifaces.skel.h" diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c new file mode 100644 index 0000000..32cde5c --- /dev/null +++ b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* must precede due to integer types + * in bpf helpers signatures. + */ +#include +#include + +const volatile __u8 is_allow_list = 0; + +/* Map containing the network interfaces indexes. + * The interpretation of the map depends on the value of is_allow_list. + */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u32); + __type(value, __u8); +} sd_restrictif SEC(".maps"); + +#define DROP 0 +#define PASS 1 + +static __always_inline int restrict_network_interfaces_impl(const struct __sk_buff *sk) { + __u32 zero = 0, ifindex; + __u8 *lookup_result; + + ifindex = sk->ifindex; + lookup_result = bpf_map_lookup_elem(&sd_restrictif, &ifindex); + if (is_allow_list) { + /* allow-list: let the packet pass if iface in the list */ + if (lookup_result) + return PASS; + } else { + /* deny-list: let the packet pass if iface *not* in the list */ + if (!lookup_result) + return PASS; + } + + return DROP; +} + +SEC("cgroup_skb/egress") +int sd_restrictif_e(const struct __sk_buff *sk) { + return restrict_network_interfaces_impl(sk); +} + +SEC("cgroup_skb/ingress") +int sd_restrictif_i(const struct __sk_buff *sk) { + return restrict_network_interfaces_impl(sk); +} + +static const char _license[] SEC("license") = "LGPL-2.1-or-later"; diff --git a/src/core/bpf/socket_bind/meson.build b/src/core/bpf/socket_bind/meson.build new file mode 100644 index 0000000..05a2b9d --- /dev/null +++ b/src/core/bpf/socket_bind/meson.build @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if conf.get('BPF_FRAMEWORK') != 1 + subdir_done() +endif + +socket_bind_bpf_o_unstripped = custom_target( + 'socket-bind.bpf.unstripped.o', + input : 'socket-bind.bpf.c', + output : 'socket-bind.bpf.unstripped.o', + command : bpf_o_unstripped_cmd) + +socket_bind_bpf_o = custom_target( + 'socket-bind.bpf.o', + input : socket_bind_bpf_o_unstripped, + output : 'socket-bind.bpf.o', + command : bpf_o_cmd) + +socket_bind_skel_h = custom_target( + 'socket-bind.skel.h', + input : socket_bind_bpf_o, + output : 'socket-bind.skel.h', + command : skel_h_cmd, + capture : true) diff --git a/src/core/bpf/socket_bind/socket-bind-api.bpf.h b/src/core/bpf/socket_bind/socket-bind-api.bpf.h new file mode 100644 index 0000000..277b9bb --- /dev/null +++ b/src/core/bpf/socket_bind/socket-bind-api.bpf.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +#include + +/* + * Bind rule is matched with socket fields accessible to cgroup/bind{4,6} hook + * through bpf_sock_addr struct. + * 'address_family' is expected to be one of AF_UNSPEC, AF_INET or AF_INET6. + * Matching by family is bypassed for rules with AF_UNSPEC set, which makes the + * rest of a rule applicable for both IPv4 and IPv6 addresses. + * If matching by family is either successful or bypassed, a rule and a socket + * are matched by ip protocol. + * If 'protocol' is 0, matching is bypassed. + * 'nr_ports' and 'port_min' fields specify a set of ports to match a user port + * with. + * If 'nr_ports' is 0, matching by port is bypassed, making that rule applicable + * for all possible ports, e.g. [1, 65535] range. Thus a rule with + * 'address_family', 'protocol' and 'nr_ports' equal to AF_UNSPEC, 0 and 0 + * correspondingly forms 'allow any' or 'deny any' cases. + * For positive 'nr_ports', a user_port lying in a range from 'port_min' to' + * 'port_min' + 'nr_ports' exclusively is considered to be a match. 'nr_ports' + * equalling to 1 forms a rule for a single port. + * Ports are in host order. + * + * Examples: + * AF_UNSPEC, 1, 0, 7777: match IPv4 and IPv6 addresses with 7777 user port; + * + * AF_INET, 1023, 0, 1: match IPv4 addresses with user port in [1, 1023] + * range inclusively; + * + * AF_INET6, 0, 0, 0: match IPv6 addresses; + * + * AF_UNSPEC, 0, 0, 0: match IPv4 and IPv6 addresses; + * + * AF_INET6, IPPROTO_TCP, 0, 0: match IPv6/TCP addresses. + */ + +struct socket_bind_rule { + __u32 address_family; + __u32 protocol; + __u16 nr_ports; + __u16 port_min; +}; + +#define SOCKET_BIND_MAX_RULES 128 diff --git a/src/core/bpf/socket_bind/socket-bind-skel.h b/src/core/bpf/socket_bind/socket-bind-skel.h new file mode 100644 index 0000000..e0d1626 --- /dev/null +++ b/src/core/bpf/socket_bind/socket-bind-skel.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +/* libbpf is used via dlopen(), so rename symbols */ +#define bpf_object__open_skeleton sym_bpf_object__open_skeleton +#define bpf_object__load_skeleton sym_bpf_object__load_skeleton +#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton + +#include "bpf/socket_bind/socket-bind.skel.h" diff --git a/src/core/bpf/socket_bind/socket-bind.bpf.c b/src/core/bpf/socket_bind/socket-bind.bpf.c new file mode 100644 index 0000000..b7972a8 --- /dev/null +++ b/src/core/bpf/socket_bind/socket-bind.bpf.c @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +#include "socket-bind-api.bpf.h" +/* must precede due to + * does not depend from type header by design. + */ +#include +#include +#include +#include +#include +#include + +/* + * max_entries is set from user space with bpf_map__set_max_entries helper. + */ +struct socket_bind_map_t { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, struct socket_bind_rule); +}; + +enum socket_bind_action { + SOCKET_BIND_DENY = 0, + SOCKET_BIND_ALLOW = 1, +}; + +struct socket_bind_map_t sd_bind_allow SEC(".maps"); +struct socket_bind_map_t sd_bind_deny SEC(".maps"); + +static __always_inline bool match_af( + __u8 address_family, const struct socket_bind_rule *r) { + return r->address_family == AF_UNSPEC || address_family == r->address_family; +} + +static __always_inline bool match_protocol( + __u32 protocol, const struct socket_bind_rule *r) { + return r->protocol == 0 || r->protocol == protocol; +} + +static __always_inline bool match_user_port( + __u16 port, const struct socket_bind_rule *r) { + return r->nr_ports == 0 || + (port >= r->port_min && port < r->port_min + (__u32) r->nr_ports); +} + +static __always_inline bool match( + __u8 address_family, + __u32 protocol, + __u16 port, + const struct socket_bind_rule *r) { + return match_af(address_family, r) && + match_protocol(protocol, r) && + match_user_port(port, r); +} + +static __always_inline bool match_rules( + struct bpf_sock_addr *ctx, + struct socket_bind_map_t *rules) { + volatile __u32 user_port = ctx->user_port; + __u16 port = (__u16)bpf_ntohs(user_port); + + for (__u32 i = 0; i < SOCKET_BIND_MAX_RULES; ++i) { + const __u32 key = i; + const struct socket_bind_rule *rule = bpf_map_lookup_elem(rules, &key); + + /* Lookup returns NULL if iterator is advanced past the last + * element put in the map. */ + if (!rule) + break; + + if (match(ctx->user_family, ctx->protocol, port, rule)) + return true; + } + + return false; +} + +static __always_inline int bind_socket(struct bpf_sock_addr *ctx) { + if (match_rules(ctx, &sd_bind_allow)) + return SOCKET_BIND_ALLOW; + + if (match_rules(ctx, &sd_bind_deny)) + return SOCKET_BIND_DENY; + + return SOCKET_BIND_ALLOW; +} + +SEC("cgroup/bind4") +int sd_bind4(struct bpf_sock_addr *ctx) { + if (ctx->user_family != AF_INET || ctx->family != AF_INET) + return SOCKET_BIND_ALLOW; + + return bind_socket(ctx); +} + +SEC("cgroup/bind6") +int sd_bind6(struct bpf_sock_addr *ctx) { + if (ctx->user_family != AF_INET6 || ctx->family != AF_INET6) + return SOCKET_BIND_ALLOW; + + return bind_socket(ctx); +} + +char _license[] SEC("license") = "GPL"; diff --git a/src/core/cgroup.c b/src/core/cgroup.c new file mode 100644 index 0000000..61ac4df --- /dev/null +++ b/src/core/cgroup.c @@ -0,0 +1,4665 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-messages.h" + +#include "af-list.h" +#include "alloc-util.h" +#include "blockdev-util.h" +#include "bpf-devices.h" +#include "bpf-firewall.h" +#include "bpf-foreign.h" +#include "bpf-socket-bind.h" +#include "btrfs-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "cgroup.h" +#include "devnum-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "firewall-util.h" +#include "in-addr-prefix-util.h" +#include "inotify-util.h" +#include "io-util.h" +#include "ip-protocol-list.h" +#include "limits-util.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "percent-util.h" +#include "process-util.h" +#include "procfs-util.h" +#include "restrict-ifaces.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "virt.h" + +#if BPF_FRAMEWORK +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "bpf/restrict_fs/restrict-fs-skel.h" +#endif + +#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC) + +/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access + * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask + * out specific attributes from us. */ +#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING) + +uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max) { + if (tasks_max->scale == 0) + return tasks_max->value; + + return system_tasks_max_scale(tasks_max->value, tasks_max->scale); +} + +bool manager_owns_host_root_cgroup(Manager *m) { + assert(m); + + /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the + * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there's + * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if + * we run in any kind of container virtualization. */ + + if (MANAGER_IS_USER(m)) + return false; + + if (detect_container() > 0) + return false; + + return empty_or_root(m->cgroup_root); +} + +bool unit_has_startup_cgroup_constraints(Unit *u) { + assert(u); + + /* Returns true if this unit has any directives which apply during + * startup/shutdown phases. */ + + CGroupContext *c; + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + return c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID || + c->startup_io_weight != CGROUP_WEIGHT_INVALID || + c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID || + c->startup_cpuset_cpus.set || + c->startup_cpuset_mems.set || + c->startup_memory_high_set || + c->startup_memory_max_set || + c->startup_memory_swap_max_set|| + c->startup_memory_zswap_max_set || + c->startup_memory_low_set; +} + +bool unit_has_host_root_cgroup(Unit *u) { + assert(u); + + /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and + * the manager manages the root cgroup. */ + + if (!manager_owns_host_root_cgroup(u->manager)) + return false; + + return unit_has_name(u, SPECIAL_ROOT_SLICE); +} + +static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) { + int r; + + r = cg_set_attribute(controller, u->cgroup_path, attribute, value); + if (r < 0) + log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m", + strna(attribute), empty_to_root(u->cgroup_path), (int) strcspn(value, NEWLINE), value); + + return r; +} + +static void cgroup_compat_warn(void) { + static bool cgroup_compat_warned = false; + + if (cgroup_compat_warned) + return; + + log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. " + "See cgroup-compat debug messages for details."); + + cgroup_compat_warned = true; +} + +#define log_cgroup_compat(unit, fmt, ...) do { \ + cgroup_compat_warn(); \ + log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \ + } while (false) + +void cgroup_context_init(CGroupContext *c) { + assert(c); + + /* Initialize everything to the kernel defaults. When initializing a bool member to 'true', make + * sure to serialize in execute-serialize.c using serialize_bool() instead of + * serialize_bool_elide(), as sd-executor will initialize here to 'true', but serialize_bool_elide() + * skips serialization if the value is 'false' (as that's the common default), so if the value at + * runtime is zero it would be lost after deserialization. Same when initializing uint64_t and other + * values, update/add a conditional serialization check. This is to minimize the amount of + * serialized data that is sent to the sd-executor, so that there is less work to do on the default + * cases. */ + + *c = (CGroupContext) { + .cpu_weight = CGROUP_WEIGHT_INVALID, + .startup_cpu_weight = CGROUP_WEIGHT_INVALID, + .cpu_quota_per_sec_usec = USEC_INFINITY, + .cpu_quota_period_usec = USEC_INFINITY, + + .cpu_shares = CGROUP_CPU_SHARES_INVALID, + .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID, + + .memory_high = CGROUP_LIMIT_MAX, + .startup_memory_high = CGROUP_LIMIT_MAX, + .memory_max = CGROUP_LIMIT_MAX, + .startup_memory_max = CGROUP_LIMIT_MAX, + .memory_swap_max = CGROUP_LIMIT_MAX, + .startup_memory_swap_max = CGROUP_LIMIT_MAX, + .memory_zswap_max = CGROUP_LIMIT_MAX, + .startup_memory_zswap_max = CGROUP_LIMIT_MAX, + + .memory_limit = CGROUP_LIMIT_MAX, + + .io_weight = CGROUP_WEIGHT_INVALID, + .startup_io_weight = CGROUP_WEIGHT_INVALID, + + .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID, + .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID, + + .tasks_max = CGROUP_TASKS_MAX_UNSET, + + .moom_swap = MANAGED_OOM_AUTO, + .moom_mem_pressure = MANAGED_OOM_AUTO, + .moom_preference = MANAGED_OOM_PREFERENCE_NONE, + + .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID, + .memory_pressure_threshold_usec = USEC_INFINITY, + }; +} + +void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) { + assert(c); + assert(a); + + LIST_REMOVE(device_allow, c->device_allow, a); + free(a->path); + free(a); +} + +void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) { + assert(c); + assert(w); + + LIST_REMOVE(device_weights, c->io_device_weights, w); + free(w->path); + free(w); +} + +void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) { + assert(c); + assert(l); + + LIST_REMOVE(device_latencies, c->io_device_latencies, l); + free(l->path); + free(l); +} + +void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) { + assert(c); + assert(l); + + LIST_REMOVE(device_limits, c->io_device_limits, l); + free(l->path); + free(l); +} + +void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) { + assert(c); + assert(w); + + LIST_REMOVE(device_weights, c->blockio_device_weights, w); + free(w->path); + free(w); +} + +void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) { + assert(c); + assert(b); + + LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b); + free(b->path); + free(b); +} + +void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) { + assert(c); + assert(p); + + LIST_REMOVE(programs, c->bpf_foreign_programs, p); + free(p->bpffs_path); + free(p); +} + +void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) { + assert(head); + + LIST_CLEAR(socket_bind_items, *head, free); +} + +void cgroup_context_done(CGroupContext *c) { + assert(c); + + while (c->io_device_weights) + cgroup_context_free_io_device_weight(c, c->io_device_weights); + + while (c->io_device_latencies) + cgroup_context_free_io_device_latency(c, c->io_device_latencies); + + while (c->io_device_limits) + cgroup_context_free_io_device_limit(c, c->io_device_limits); + + while (c->blockio_device_weights) + cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); + + while (c->blockio_device_bandwidths) + cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths); + + while (c->device_allow) + cgroup_context_free_device_allow(c, c->device_allow); + + cgroup_context_remove_socket_bind(&c->socket_bind_allow); + cgroup_context_remove_socket_bind(&c->socket_bind_deny); + + c->ip_address_allow = set_free(c->ip_address_allow); + c->ip_address_deny = set_free(c->ip_address_deny); + + c->ip_filters_ingress = strv_free(c->ip_filters_ingress); + c->ip_filters_egress = strv_free(c->ip_filters_egress); + + while (c->bpf_foreign_programs) + cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs); + + c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces); + + cpu_set_reset(&c->cpuset_cpus); + cpu_set_reset(&c->startup_cpuset_cpus); + cpu_set_reset(&c->cpuset_mems); + cpu_set_reset(&c->startup_cpuset_mems); + + c->delegate_subgroup = mfree(c->delegate_subgroup); + + nft_set_context_clear(&c->nft_set_context); +} + +static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) { + assert(u); + + if (!u->cgroup_realized) + return -EOWNERDEAD; + + return cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret); +} + +static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) { + CGroupContext *c; + CGroupMask m; + const char *file; + uint64_t unit_value; + int r; + + /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will + * return -ENODATA) on cgroup v1. + * + * Returns: + * + * <0: On error. + * 0: If the kernel memory setting doesn't match our configuration. + * >0: If the kernel memory setting matches our configuration. + * + * The following values are only guaranteed to be populated on return >=0: + * + * - ret_unit_value will contain our internal expected value for the unit, page-aligned. + * - ret_kernel_value will contain the actual value presented by the kernel. */ + + assert(u); + + r = cg_all_unified(); + if (r < 0) + return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m"); + + /* Unsupported on v1. + * + * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has + * silently masked the controller. */ + if (r == 0) + return -ENODATA; + + /* The root slice doesn't have any controller files, so we can't compare anything. */ + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return -ENODATA; + + /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled, + * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To + * avoid specious errors in these scenarios, check that we even expect the memory controller to be + * enabled at all. */ + m = unit_get_target_mask(u); + if (!FLAGS_SET(m, CGROUP_MASK_MEMORY)) + return -ENODATA; + + assert_se(c = unit_get_cgroup_context(u)); + + bool startup = u->manager && IN_SET(manager_state(u->manager), MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING); + + if (streq(property_name, "MemoryLow")) { + unit_value = unit_get_ancestor_memory_low(u); + file = "memory.low"; + } else if (startup && streq(property_name, "StartupMemoryLow")) { + unit_value = unit_get_ancestor_startup_memory_low(u); + file = "memory.low"; + } else if (streq(property_name, "MemoryMin")) { + unit_value = unit_get_ancestor_memory_min(u); + file = "memory.min"; + } else if (streq(property_name, "MemoryHigh")) { + unit_value = c->memory_high; + file = "memory.high"; + } else if (startup && streq(property_name, "StartupMemoryHigh")) { + unit_value = c->startup_memory_high; + file = "memory.high"; + } else if (streq(property_name, "MemoryMax")) { + unit_value = c->memory_max; + file = "memory.max"; + } else if (startup && streq(property_name, "StartupMemoryMax")) { + unit_value = c->startup_memory_max; + file = "memory.max"; + } else if (streq(property_name, "MemorySwapMax")) { + unit_value = c->memory_swap_max; + file = "memory.swap.max"; + } else if (startup && streq(property_name, "StartupMemorySwapMax")) { + unit_value = c->startup_memory_swap_max; + file = "memory.swap.max"; + } else if (streq(property_name, "MemoryZSwapMax")) { + unit_value = c->memory_zswap_max; + file = "memory.zswap.max"; + } else if (startup && streq(property_name, "StartupMemoryZSwapMax")) { + unit_value = c->startup_memory_zswap_max; + file = "memory.zswap.max"; + } else + return -EINVAL; + + r = unit_get_kernel_memory_limit(u, file, ret_kernel_value); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file); + + /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page + * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from + * our internal page-counting. To support those future kernels, just check the value itself first + * without any page-alignment. */ + if (*ret_kernel_value == unit_value) { + *ret_unit_value = unit_value; + return 1; + } + + /* The current kernel behaviour, by comparison, is that even if you write a particular number of + * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel + * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is + * cricket. */ + if (unit_value != CGROUP_LIMIT_MAX) + unit_value = PAGE_ALIGN_DOWN(unit_value); + + *ret_unit_value = unit_value; + + return *ret_kernel_value == *ret_unit_value; +} + +#define FORMAT_CGROUP_DIFF_MAX 128 + +static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) { + uint64_t kval, sval; + int r; + + assert(u); + assert(buf); + assert(l > 0); + + r = unit_compare_memory_limit(u, property_name, &sval, &kval); + + /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1). + * In the absence of reliably being able to detect whether memcg swap support is available or not, + * only complain if the error is not ENOENT. This is similarly the case for memory.zswap.max relying + * on CONFIG_ZSWAP. */ + if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) || + (r == -ENOENT && STR_IN_SET(property_name, + "MemorySwapMax", + "StartupMemorySwapMax", + "MemoryZSwapMax", + "StartupMemoryZSwapMax"))) + buf[0] = 0; + else if (r < 0) { + errno = -r; + (void) snprintf(buf, l, " (error getting kernel value: %m)"); + } else + (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval); + + return buf; +} + +const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) { + static const char *table[_CGROUP_DEVICE_PERMISSIONS_MAX] = { + /* Lets simply define a table with every possible combination. As long as those are just 8 we + * can get away with it. If this ever grows to more we need to revisit this logic though. */ + [0] = "", + [CGROUP_DEVICE_READ] = "r", + [CGROUP_DEVICE_WRITE] = "w", + [CGROUP_DEVICE_MKNOD] = "m", + [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE] = "rw", + [CGROUP_DEVICE_READ|CGROUP_DEVICE_MKNOD] = "rm", + [CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "wm", + [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "rwm", + }; + + if (p < 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX) + return NULL; + + return table[p]; +} + +CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) { + CGroupDevicePermissions p = 0; + + if (!s) + return _CGROUP_DEVICE_PERMISSIONS_INVALID; + + for (const char *c = s; *c; c++) { + if (*c == 'r') + p |= CGROUP_DEVICE_READ; + else if (*c == 'w') + p |= CGROUP_DEVICE_WRITE; + else if (*c == 'm') + p |= CGROUP_DEVICE_MKNOD; + else + return _CGROUP_DEVICE_PERMISSIONS_INVALID; + } + + return p; +} + +void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL; + CGroupContext *c; + struct in_addr_prefix *iaai; + + char cda[FORMAT_CGROUP_DIFF_MAX]; + char cdb[FORMAT_CGROUP_DIFF_MAX]; + char cdc[FORMAT_CGROUP_DIFF_MAX]; + char cdd[FORMAT_CGROUP_DIFF_MAX]; + char cde[FORMAT_CGROUP_DIFF_MAX]; + char cdf[FORMAT_CGROUP_DIFF_MAX]; + char cdg[FORMAT_CGROUP_DIFF_MAX]; + char cdh[FORMAT_CGROUP_DIFF_MAX]; + char cdi[FORMAT_CGROUP_DIFF_MAX]; + char cdj[FORMAT_CGROUP_DIFF_MAX]; + char cdk[FORMAT_CGROUP_DIFF_MAX]; + + assert(u); + assert(f); + + assert_se(c = unit_get_cgroup_context(u)); + + prefix = strempty(prefix); + + (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str); + (void) cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str); + + /* "Delegate=" means "yes, but no controllers". Show this as "(none)". */ + const char *delegate_str = delegate_controllers_str ?: c->delegate ? "(none)" : "no"; + + cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus); + startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus); + cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems); + startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems); + + fprintf(f, + "%sCPUAccounting: %s\n" + "%sIOAccounting: %s\n" + "%sBlockIOAccounting: %s\n" + "%sMemoryAccounting: %s\n" + "%sTasksAccounting: %s\n" + "%sIPAccounting: %s\n" + "%sCPUWeight: %" PRIu64 "\n" + "%sStartupCPUWeight: %" PRIu64 "\n" + "%sCPUShares: %" PRIu64 "\n" + "%sStartupCPUShares: %" PRIu64 "\n" + "%sCPUQuotaPerSecSec: %s\n" + "%sCPUQuotaPeriodSec: %s\n" + "%sAllowedCPUs: %s\n" + "%sStartupAllowedCPUs: %s\n" + "%sAllowedMemoryNodes: %s\n" + "%sStartupAllowedMemoryNodes: %s\n" + "%sIOWeight: %" PRIu64 "\n" + "%sStartupIOWeight: %" PRIu64 "\n" + "%sBlockIOWeight: %" PRIu64 "\n" + "%sStartupBlockIOWeight: %" PRIu64 "\n" + "%sDefaultMemoryMin: %" PRIu64 "\n" + "%sDefaultMemoryLow: %" PRIu64 "\n" + "%sMemoryMin: %" PRIu64 "%s\n" + "%sMemoryLow: %" PRIu64 "%s\n" + "%sStartupMemoryLow: %" PRIu64 "%s\n" + "%sMemoryHigh: %" PRIu64 "%s\n" + "%sStartupMemoryHigh: %" PRIu64 "%s\n" + "%sMemoryMax: %" PRIu64 "%s\n" + "%sStartupMemoryMax: %" PRIu64 "%s\n" + "%sMemorySwapMax: %" PRIu64 "%s\n" + "%sStartupMemorySwapMax: %" PRIu64 "%s\n" + "%sMemoryZSwapMax: %" PRIu64 "%s\n" + "%sStartupMemoryZSwapMax: %" PRIu64 "%s\n" + "%sMemoryLimit: %" PRIu64 "\n" + "%sTasksMax: %" PRIu64 "\n" + "%sDevicePolicy: %s\n" + "%sDisableControllers: %s\n" + "%sDelegate: %s\n" + "%sManagedOOMSwap: %s\n" + "%sManagedOOMMemoryPressure: %s\n" + "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n" + "%sManagedOOMPreference: %s\n" + "%sMemoryPressureWatch: %s\n" + "%sCoredumpReceive: %s\n", + prefix, yes_no(c->cpu_accounting), + prefix, yes_no(c->io_accounting), + prefix, yes_no(c->blockio_accounting), + prefix, yes_no(c->memory_accounting), + prefix, yes_no(c->tasks_accounting), + prefix, yes_no(c->ip_accounting), + prefix, c->cpu_weight, + prefix, c->startup_cpu_weight, + prefix, c->cpu_shares, + prefix, c->startup_cpu_shares, + prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1), + prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1), + prefix, strempty(cpuset_cpus), + prefix, strempty(startup_cpuset_cpus), + prefix, strempty(cpuset_mems), + prefix, strempty(startup_cpuset_mems), + prefix, c->io_weight, + prefix, c->startup_io_weight, + prefix, c->blockio_weight, + prefix, c->startup_blockio_weight, + prefix, c->default_memory_min, + prefix, c->default_memory_low, + prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"), + prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"), + prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "StartupMemoryLow"), + prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryHigh"), + prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "StartupMemoryHigh"), + prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdf, sizeof(cdf), u, "MemoryMax"), + prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(cdg, sizeof(cdg), u, "StartupMemoryMax"), + prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cdh, sizeof(cdh), u, "MemorySwapMax"), + prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(cdi, sizeof(cdi), u, "StartupMemorySwapMax"), + prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(cdj, sizeof(cdj), u, "MemoryZSwapMax"), + prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(cdk, sizeof(cdk), u, "StartupMemoryZSwapMax"), + prefix, c->memory_limit, + prefix, cgroup_tasks_max_resolve(&c->tasks_max), + prefix, cgroup_device_policy_to_string(c->device_policy), + prefix, strempty(disable_controllers_str), + prefix, delegate_str, + prefix, managed_oom_mode_to_string(c->moom_swap), + prefix, managed_oom_mode_to_string(c->moom_mem_pressure), + prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)), + prefix, managed_oom_preference_to_string(c->moom_preference), + prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch), + prefix, yes_no(c->coredump_receive)); + + if (c->delegate_subgroup) + fprintf(f, "%sDelegateSubgroup: %s\n", + prefix, c->delegate_subgroup); + + if (c->memory_pressure_threshold_usec != USEC_INFINITY) + fprintf(f, "%sMemoryPressureThresholdSec: %s\n", + prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1)); + + LIST_FOREACH(device_allow, a, c->device_allow) + /* strna() below should be redundant, for avoiding -Werror=format-overflow= error. See #30223. */ + fprintf(f, + "%sDeviceAllow: %s %s\n", + prefix, + a->path, + strna(cgroup_device_permissions_to_string(a->permissions))); + + LIST_FOREACH(device_weights, iw, c->io_device_weights) + fprintf(f, + "%sIODeviceWeight: %s %" PRIu64 "\n", + prefix, + iw->path, + iw->weight); + + LIST_FOREACH(device_latencies, l, c->io_device_latencies) + fprintf(f, + "%sIODeviceLatencyTargetSec: %s %s\n", + prefix, + l->path, + FORMAT_TIMESPAN(l->target_usec, 1)); + + LIST_FOREACH(device_limits, il, c->io_device_limits) + for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + if (il->limits[type] != cgroup_io_limit_defaults[type]) + fprintf(f, + "%s%s: %s %s\n", + prefix, + cgroup_io_limit_type_to_string(type), + il->path, + FORMAT_BYTES(il->limits[type])); + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) + fprintf(f, + "%sBlockIODeviceWeight: %s %" PRIu64, + prefix, + w->path, + w->weight); + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + if (b->rbps != CGROUP_LIMIT_MAX) + fprintf(f, + "%sBlockIOReadBandwidth: %s %s\n", + prefix, + b->path, + FORMAT_BYTES(b->rbps)); + if (b->wbps != CGROUP_LIMIT_MAX) + fprintf(f, + "%sBlockIOWriteBandwidth: %s %s\n", + prefix, + b->path, + FORMAT_BYTES(b->wbps)); + } + + SET_FOREACH(iaai, c->ip_address_allow) + fprintf(f, "%sIPAddressAllow: %s\n", prefix, + IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen)); + SET_FOREACH(iaai, c->ip_address_deny) + fprintf(f, "%sIPAddressDeny: %s\n", prefix, + IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen)); + + STRV_FOREACH(path, c->ip_filters_ingress) + fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path); + STRV_FOREACH(path, c->ip_filters_egress) + fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path); + + LIST_FOREACH(programs, p, c->bpf_foreign_programs) + fprintf(f, "%sBPFProgram: %s:%s", + prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path); + + if (c->socket_bind_allow) { + fprintf(f, "%sSocketBindAllow: ", prefix); + cgroup_context_dump_socket_bind_items(c->socket_bind_allow, f); + fputc('\n', f); + } + + if (c->socket_bind_deny) { + fprintf(f, "%sSocketBindDeny: ", prefix); + cgroup_context_dump_socket_bind_items(c->socket_bind_deny, f); + fputc('\n', f); + } + + if (c->restrict_network_interfaces) { + char *iface; + SET_FOREACH(iface, c->restrict_network_interfaces) + fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface); + } + + FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) + fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source), + nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set); +} + +void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) { + const char *family, *colon1, *protocol = "", *colon2 = ""; + + family = strempty(af_to_ipv4_ipv6(item->address_family)); + colon1 = isempty(family) ? "" : ":"; + + if (item->ip_protocol != 0) { + protocol = ip_protocol_to_tcp_udp(item->ip_protocol); + colon2 = ":"; + } + + if (item->nr_ports == 0) + fprintf(f, "%s%s%s%sany", family, colon1, protocol, colon2); + else if (item->nr_ports == 1) + fprintf(f, "%s%s%s%s%" PRIu16, family, colon1, protocol, colon2, item->port_min); + else { + uint16_t port_max = item->port_min + item->nr_ports - 1; + fprintf(f, "%s%s%s%s%" PRIu16 "-%" PRIu16, family, colon1, protocol, colon2, + item->port_min, port_max); + } +} + +void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f) { + bool first = true; + + LIST_FOREACH(socket_bind_items, bi, items) { + if (first) + first = false; + else + fputc(' ', f); + + cgroup_context_dump_socket_bind_item(bi, f); + } +} + +int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) { + _cleanup_free_ CGroupDeviceAllow *a = NULL; + _cleanup_free_ char *d = NULL; + + assert(c); + assert(dev); + assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX); + + if (p == 0) + p = _CGROUP_DEVICE_PERMISSIONS_ALL; + + a = new(CGroupDeviceAllow, 1); + if (!a) + return -ENOMEM; + + d = strdup(dev); + if (!d) + return -ENOMEM; + + *a = (CGroupDeviceAllow) { + .path = TAKE_PTR(d), + .permissions = p, + }; + + LIST_PREPEND(device_allow, c->device_allow, a); + TAKE_PTR(a); + + return 0; +} + +int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) { + assert(c); + assert(dev); + assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX); + + if (p == 0) + p = _CGROUP_DEVICE_PERMISSIONS_ALL; + + LIST_FOREACH(device_allow, b, c->device_allow) + if (path_equal(b->path, dev)) { + b->permissions = p; + return 0; + } + + return cgroup_context_add_device_allow(c, dev, p); +} + +int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) { + CGroupBPFForeignProgram *p; + _cleanup_free_ char *d = NULL; + + assert(c); + assert(bpffs_path); + + if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized: %m"); + + d = strdup(bpffs_path); + if (!d) + return log_oom(); + + p = new(CGroupBPFForeignProgram, 1); + if (!p) + return log_oom(); + + *p = (CGroupBPFForeignProgram) { + .attach_type = attach_type, + .bpffs_path = TAKE_PTR(d), + }; + + LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p)); + + return 0; +} + +#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry) \ + uint64_t unit_get_ancestor_##entry(Unit *u) { \ + CGroupContext *c; \ + \ + /* 1. Is entry set in this unit? If so, use that. \ + * 2. Is the default for this entry set in any \ + * ancestor? If so, use that. \ + * 3. Otherwise, return CGROUP_LIMIT_MIN. */ \ + \ + assert(u); \ + \ + c = unit_get_cgroup_context(u); \ + if (c && c->entry##_set) \ + return c->entry; \ + \ + while ((u = UNIT_GET_SLICE(u))) { \ + c = unit_get_cgroup_context(u); \ + if (c && c->default_##entry##_set) \ + return c->default_##entry; \ + } \ + \ + /* We've reached the root, but nobody had default for \ + * this entry set, so set it to the kernel default. */ \ + return CGROUP_LIMIT_MIN; \ +} + +UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low); +UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(startup_memory_low); +UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min); + +static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data, size_t size) { + int r; + + assert(u); + assert(name); + + if (!u->cgroup_path) + return; + + r = cg_set_xattr(u->cgroup_path, name, data, size, 0); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path)); +} + +static void unit_remove_xattr_graceful(Unit *u, const char *name) { + int r; + + assert(u); + assert(name); + + if (!u->cgroup_path) + return; + + r = cg_remove_xattr(u->cgroup_path, name); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) + log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path)); +} + +static void cgroup_oomd_xattr_apply(Unit *u) { + CGroupContext *c; + + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return; + + if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT) + unit_set_xattr_graceful(u, "user.oomd_omit", "1", 1); + + if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID) + unit_set_xattr_graceful(u, "user.oomd_avoid", "1", 1); + + if (c->moom_preference != MANAGED_OOM_PREFERENCE_AVOID) + unit_remove_xattr_graceful(u, "user.oomd_avoid"); + + if (c->moom_preference != MANAGED_OOM_PREFERENCE_OMIT) + unit_remove_xattr_graceful(u, "user.oomd_omit"); +} + +static int cgroup_log_xattr_apply(Unit *u) { + ExecContext *c; + size_t len, allowed_patterns_len, denied_patterns_len; + _cleanup_free_ char *patterns = NULL, *allowed_patterns = NULL, *denied_patterns = NULL; + char *last; + int r; + + assert(u); + + c = unit_get_exec_context(u); + if (!c) + /* Some unit types have a cgroup context but no exec context, so we do not log + * any error here to avoid confusion. */ + return 0; + + if (set_isempty(c->log_filter_allowed_patterns) && set_isempty(c->log_filter_denied_patterns)) { + unit_remove_xattr_graceful(u, "user.journald_log_filter_patterns"); + return 0; + } + + r = set_make_nulstr(c->log_filter_allowed_patterns, &allowed_patterns, &allowed_patterns_len); + if (r < 0) + return log_debug_errno(r, "Failed to make nulstr from set: %m"); + + r = set_make_nulstr(c->log_filter_denied_patterns, &denied_patterns, &denied_patterns_len); + if (r < 0) + return log_debug_errno(r, "Failed to make nulstr from set: %m"); + + /* Use nul character separated strings without trailing nul */ + allowed_patterns_len = LESS_BY(allowed_patterns_len, 1u); + denied_patterns_len = LESS_BY(denied_patterns_len, 1u); + + len = allowed_patterns_len + 1 + denied_patterns_len; + patterns = new(char, len); + if (!patterns) + return log_oom_debug(); + + last = mempcpy_safe(patterns, allowed_patterns, allowed_patterns_len); + *(last++) = '\xff'; + memcpy_safe(last, denied_patterns, denied_patterns_len); + + unit_set_xattr_graceful(u, "user.journald_log_filter_patterns", patterns, len); + + return 0; +} + +static void cgroup_invocation_id_xattr_apply(Unit *u) { + bool b; + + assert(u); + + b = !sd_id128_is_null(u->invocation_id); + FOREACH_STRING(xn, "trusted.invocation_id", "user.invocation_id") { + if (b) + unit_set_xattr_graceful(u, xn, SD_ID128_TO_STRING(u->invocation_id), 32); + else + unit_remove_xattr_graceful(u, xn); + } +} + +static void cgroup_coredump_xattr_apply(Unit *u) { + CGroupContext *c; + + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return; + + if (unit_cgroup_delegate(u) && c->coredump_receive) + unit_set_xattr_graceful(u, "user.coredump_receive", "1", 1); + else + unit_remove_xattr_graceful(u, "user.coredump_receive"); +} + +static void cgroup_delegate_xattr_apply(Unit *u) { + bool b; + + assert(u); + + /* Indicate on the cgroup whether delegation is on, via an xattr. This is best-effort, as old kernels + * didn't support xattrs on cgroups at all. Later they got support for setting 'trusted.*' xattrs, + * and even later 'user.*' xattrs. We started setting this field when 'trusted.*' was added, and + * given this is now pretty much API, let's continue to support that. But also set 'user.*' as well, + * since it is readable by any user, not just CAP_SYS_ADMIN. This hence comes with slightly weaker + * security (as users who got delegated cgroups could turn it off if they like), but this shouldn't + * be a big problem given this communicates delegation state to clients, but the manager never reads + * it. */ + b = unit_cgroup_delegate(u); + FOREACH_STRING(xn, "trusted.delegate", "user.delegate") { + if (b) + unit_set_xattr_graceful(u, xn, "1", 1); + else + unit_remove_xattr_graceful(u, xn); + } +} + +static void cgroup_survive_xattr_apply(Unit *u) { + int r; + + assert(u); + + if (u->survive_final_kill_signal) { + r = cg_set_xattr( + u->cgroup_path, + "user.survive_final_kill_signal", + "1", + 1, + /* flags= */ 0); + /* user xattr support was added in kernel v5.7 */ + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + r = cg_set_xattr( + u->cgroup_path, + "trusted.survive_final_kill_signal", + "1", + 1, + /* flags= */ 0); + if (r < 0) + log_unit_debug_errno(u, + r, + "Failed to set 'survive_final_kill_signal' xattr on control " + "group %s, ignoring: %m", + empty_to_root(u->cgroup_path)); + } else { + unit_remove_xattr_graceful(u, "user.survive_final_kill_signal"); + unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal"); + } +} + +static void cgroup_xattr_apply(Unit *u) { + assert(u); + + /* The 'user.*' xattrs can be set from a user manager. */ + cgroup_oomd_xattr_apply(u); + cgroup_log_xattr_apply(u); + cgroup_coredump_xattr_apply(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + cgroup_invocation_id_xattr_apply(u); + cgroup_delegate_xattr_apply(u); + cgroup_survive_xattr_apply(u); +} + +static int lookup_block_device(const char *p, dev_t *ret) { + dev_t rdev, dev = 0; + mode_t mode; + int r; + + assert(p); + assert(ret); + + r = device_path_parse_major_minor(p, &mode, &rdev); + if (r == -ENODEV) { /* not a parsable device node, need to go to disk */ + struct stat st; + + if (stat(p, &st) < 0) + return log_warning_errno(errno, "Couldn't stat device '%s': %m", p); + + mode = st.st_mode; + rdev = st.st_rdev; + dev = st.st_dev; + } else if (r < 0) + return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p); + + if (S_ISCHR(mode)) + return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK), + "Device node '%s' is a character device, but block device needed.", p); + if (S_ISBLK(mode)) + *ret = rdev; + else if (major(dev) != 0) + *ret = dev; /* If this is not a device node then use the block device this file is stored on */ + else { + /* If this is btrfs, getting the backing block device is a bit harder */ + r = btrfs_get_block_device(p, ret); + if (r == -ENOTTY) + return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), + "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p); + if (r < 0) + return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p); + } + + /* If this is a LUKS/DM device, recursively try to get the originating block device */ + while (block_get_originating(*ret, ret) > 0); + + /* If this is a partition, try to get the originating block device */ + (void) block_get_whole_disk(*ret, ret); + return 0; +} + +static bool cgroup_context_has_cpu_weight(CGroupContext *c) { + return c->cpu_weight != CGROUP_WEIGHT_INVALID || + c->startup_cpu_weight != CGROUP_WEIGHT_INVALID; +} + +static bool cgroup_context_has_cpu_shares(CGroupContext *c) { + return c->cpu_shares != CGROUP_CPU_SHARES_INVALID || + c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID; +} + +static bool cgroup_context_has_allowed_cpus(CGroupContext *c) { + return c->cpuset_cpus.set || c->startup_cpuset_cpus.set; +} + +static bool cgroup_context_has_allowed_mems(CGroupContext *c) { + return c->cpuset_mems.set || c->startup_cpuset_mems.set; +} + +uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) { + assert(c); + + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) && + c->startup_cpu_weight != CGROUP_WEIGHT_INVALID) + return c->startup_cpu_weight; + else if (c->cpu_weight != CGROUP_WEIGHT_INVALID) + return c->cpu_weight; + else + return CGROUP_WEIGHT_DEFAULT; +} + +static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) && + c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID) + return c->startup_cpu_shares; + else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID) + return c->cpu_shares; + else + return CGROUP_CPU_SHARES_DEFAULT; +} + +static CPUSet *cgroup_context_allowed_cpus(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) && + c->startup_cpuset_cpus.set) + return &c->startup_cpuset_cpus; + else + return &c->cpuset_cpus; +} + +static CPUSet *cgroup_context_allowed_mems(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) && + c->startup_cpuset_mems.set) + return &c->startup_cpuset_mems; + else + return &c->cpuset_mems; +} + +usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) { + /* kernel uses a minimum resolution of 1ms, so both period and (quota * period) + * need to be higher than that boundary. quota is specified in USecPerSec. + * Additionally, period must be at most max_period. */ + assert(quota > 0); + + return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period); +} + +static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) { + usec_t new_period; + + if (quota == USEC_INFINITY) + /* Always use default period for infinity quota. */ + return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC; + + if (period == USEC_INFINITY) + /* Default period was requested. */ + period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC; + + /* Clamp to interval [1ms, 1s] */ + new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC); + + if (new_period != period) { + log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, + "Clamping CPU interval for cpu.max: period is now %s", + FORMAT_TIMESPAN(new_period, 1)); + u->warned_clamping_cpu_quota_period = true; + } + + return new_period; +} + +static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; + + if (weight == CGROUP_WEIGHT_IDLE) + return; + xsprintf(buf, "%" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf); +} + +static void cgroup_apply_unified_cpu_idle(Unit *u, uint64_t weight) { + int r; + bool is_idle; + const char *idle_val; + + is_idle = weight == CGROUP_WEIGHT_IDLE; + idle_val = one_zero(is_idle); + r = cg_set_attribute("cpu", u->cgroup_path, "cpu.idle", idle_val); + if (r < 0 && (r != -ENOENT || is_idle)) + log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m", + "cpu.idle", empty_to_root(u->cgroup_path), idle_val); +} + +static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) { + char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1]; + + period = cgroup_cpu_adjust_period_and_log(u, period, quota); + if (quota != USEC_INFINITY) + xsprintf(buf, USEC_FMT " " USEC_FMT "\n", + MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period); + else + xsprintf(buf, "max " USEC_FMT "\n", period); + (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf); +} + +static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; + + xsprintf(buf, "%" PRIu64 "\n", shares); + (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf); +} + +static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) { + char buf[DECIMAL_STR_MAX(usec_t) + 2]; + + period = cgroup_cpu_adjust_period_and_log(u, period, quota); + + xsprintf(buf, USEC_FMT "\n", period); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf); + + if (quota != USEC_INFINITY) { + xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC)); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf); + } else + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n"); +} + +static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) { + return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT, + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) { + /* we don't support idle in cgroupv1 */ + if (weight == CGROUP_WEIGHT_IDLE) + return CGROUP_CPU_SHARES_MIN; + + return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT, + CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX); +} + +static void cgroup_apply_unified_cpuset(Unit *u, const CPUSet *cpus, const char *name) { + _cleanup_free_ char *buf = NULL; + + buf = cpu_set_to_range_string(cpus); + if (!buf) { + log_oom(); + return; + } + + (void) set_attribute_and_warn(u, "cpuset", name, buf); +} + +static bool cgroup_context_has_io_config(CGroupContext *c) { + return c->io_accounting || + c->io_weight != CGROUP_WEIGHT_INVALID || + c->startup_io_weight != CGROUP_WEIGHT_INVALID || + c->io_device_weights || + c->io_device_latencies || + c->io_device_limits; +} + +static bool cgroup_context_has_blockio_config(CGroupContext *c) { + return c->blockio_accounting || + c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID || + c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID || + c->blockio_device_weights || + c->blockio_device_bandwidths; +} + +static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) && + c->startup_io_weight != CGROUP_WEIGHT_INVALID) + return c->startup_io_weight; + if (c->io_weight != CGROUP_WEIGHT_INVALID) + return c->io_weight; + return CGROUP_WEIGHT_DEFAULT; +} + +static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) && + c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) + return c->startup_blockio_weight; + if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) + return c->blockio_weight; + return CGROUP_BLKIO_WEIGHT_DEFAULT; +} + +static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) { + return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT, + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) { + return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT, + CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX); +} + +static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t io_weight) { + static const char * const prop_names[] = { + "IOWeight", + "BlockIOWeight", + "IODeviceWeight", + "BlockIODeviceWeight", + }; + static bool warned = false; + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+STRLEN("\n")]; + const char *p; + uint64_t bfq_weight; + int r; + + /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight" + * See also: https://github.com/systemd/systemd/pull/13335 and + * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */ + p = strjoina(controller, ".bfq.weight"); + /* Adjust to kernel range is 1..1000, the default is 100. */ + bfq_weight = BFQ_WEIGHT(io_weight); + + if (major(dev) > 0) + xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), bfq_weight); + else + xsprintf(buf, "%" PRIu64 "\n", bfq_weight); + + r = cg_set_attribute(controller, u->cgroup_path, p, buf); + + /* FIXME: drop this when kernels prior + * 795fe54c2a82 ("bfq: Add per-device weight") v5.4 + * are not interesting anymore. Old kernels will fail with EINVAL, while new kernels won't return + * EINVAL on properly formatted input by us. Treat EINVAL accordingly. */ + if (r == -EINVAL && major(dev) > 0) { + if (!warned) { + log_unit_warning(u, "Kernel version does not accept per-device setting in %s.", p); + warned = true; + } + r = -EOPNOTSUPP; /* mask as unconfigured device */ + } else if (r >= 0 && io_weight != bfq_weight) + log_unit_debug(u, "%s=%" PRIu64 " scaled to %s=%" PRIu64, + prop_names[2*(major(dev) > 0) + streq(controller, "blkio")], + io_weight, p, bfq_weight); + return r; +} + +static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r, r1, r2; + + if (lookup_block_device(dev_path, &dev) < 0) + return; + + r1 = set_bfq_weight(u, "io", dev, io_weight); + + xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight); + r2 = cg_set_attribute("io", u->cgroup_path, "io.weight", buf); + + /* Look at the configured device, when both fail, prefer io.weight errno. */ + r = r2 == -EOPNOTSUPP ? r1 : r2; + + if (r < 0) + log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), + r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m", + empty_to_root(u->cgroup_path), (int) strcspn(buf, NEWLINE), buf); +} + +static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), blkio_weight); + (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf); +} + +static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + if (target != USEC_INFINITY) + xsprintf(buf, DEVNUM_FORMAT_STR " target=%" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), target); + else + xsprintf(buf, DEVNUM_FORMAT_STR " target=max\n", DEVNUM_FORMAT_VAL(dev)); + + (void) set_attribute_and_warn(u, "io", "io.latency", buf); +} + +static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) { + char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)], + buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4]; + dev_t dev; + + if (lookup_block_device(dev_path, &dev) < 0) + return; + + for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + if (limits[type] != cgroup_io_limit_defaults[type]) + xsprintf(limit_bufs[type], "%" PRIu64, limits[type]); + else + xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0"); + + xsprintf(buf, DEVNUM_FORMAT_STR " rbps=%s wbps=%s riops=%s wiops=%s\n", DEVNUM_FORMAT_VAL(dev), + limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX], + limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]); + (void) set_attribute_and_warn(u, "io", "io.max", buf); +} + +static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + + if (lookup_block_device(dev_path, &dev) < 0) + return; + + sprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), rbps); + (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf); + + sprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), wbps); + (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf); +} + +static bool unit_has_unified_memory_config(Unit *u) { + CGroupContext *c; + + assert(u); + + assert_se(c = unit_get_cgroup_context(u)); + + return unit_get_ancestor_memory_min(u) > 0 || + unit_get_ancestor_memory_low(u) > 0 || unit_get_ancestor_startup_memory_low(u) > 0 || + c->memory_high != CGROUP_LIMIT_MAX || c->startup_memory_high_set || + c->memory_max != CGROUP_LIMIT_MAX || c->startup_memory_max_set || + c->memory_swap_max != CGROUP_LIMIT_MAX || c->startup_memory_swap_max_set || + c->memory_zswap_max != CGROUP_LIMIT_MAX || c->startup_memory_zswap_max_set; +} + +static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) { + char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n"; + + if (v != CGROUP_LIMIT_MAX) + xsprintf(buf, "%" PRIu64 "\n", v); + + (void) set_attribute_and_warn(u, "memory", file, buf); +} + +static void cgroup_apply_firewall(Unit *u) { + assert(u); + + /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */ + + if (bpf_firewall_compile(u) < 0) + return; + + (void) bpf_firewall_load_custom(u); + (void) bpf_firewall_install(u); +} + +void unit_modify_nft_set(Unit *u, bool add) { + int r; + + assert(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + + if (cg_all_unified() <= 0) + return; + + if (u->cgroup_id == 0) + return; + + if (!u->manager->fw_ctx) { + r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false); + if (r < 0) + return; + + assert(u->manager->fw_ctx); + } + + CGroupContext *c = ASSERT_PTR(unit_get_cgroup_context(u)); + + FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) { + if (nft_set->source != NFT_SET_SOURCE_CGROUP) + continue; + + uint64_t element = u->cgroup_id; + + r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element)); + if (r < 0) + log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m", + add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id); + else + log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64, + add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id); + } +} + +static void cgroup_apply_socket_bind(Unit *u) { + assert(u); + + (void) bpf_socket_bind_install(u); +} + +static void cgroup_apply_restrict_network_interfaces(Unit *u) { + assert(u); + + (void) restrict_network_interfaces_install(u); +} + +static int cgroup_apply_devices(Unit *u) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + const char *path; + CGroupContext *c; + CGroupDevicePolicy policy; + int r; + + assert_se(c = unit_get_cgroup_context(u)); + assert_se(path = u->cgroup_path); + + policy = c->device_policy; + + if (cg_all_unified() > 0) { + r = bpf_devices_cgroup_init(&prog, policy, c->device_allow); + if (r < 0) + return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m"); + + } else { + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore + * EINVAL here. */ + + if (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO) + r = cg_set_attribute("devices", path, "devices.deny", "a"); + else + r = cg_set_attribute("devices", path, "devices.allow", "a"); + if (r < 0) + log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to reset devices.allow/devices.deny: %m"); + } + + bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED || + (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow); + if (allow_list_static) + (void) bpf_devices_allow_list_static(prog, path); + + bool any = allow_list_static; + LIST_FOREACH(device_allow, a, c->device_allow) { + const char *val; + + if (a->permissions == 0) + continue; + + if (path_startswith(a->path, "/dev/")) + r = bpf_devices_allow_list_device(prog, path, a->path, a->permissions); + else if ((val = startswith(a->path, "block-"))) + r = bpf_devices_allow_list_major(prog, path, val, 'b', a->permissions); + else if ((val = startswith(a->path, "char-"))) + r = bpf_devices_allow_list_major(prog, path, val, 'c', a->permissions); + else { + log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path); + continue; + } + + if (r >= 0) + any = true; + } + + if (prog && !any) { + log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter."); + + /* The kernel verifier would reject a program we would build with the normal intro and outro + but no allow-listing rules (outro would contain an unreachable instruction for successful + return). */ + policy = CGROUP_DEVICE_POLICY_STRICT; + } + + r = bpf_devices_apply_policy(&prog, policy, any, path, &u->bpf_device_control_installed); + if (r < 0) { + static bool warned = false; + + log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r, + "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n" + "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n" + "(This warning is only shown for the first loaded unit using device ACL.)", u->id); + + warned = true; + } + return r; +} + +static void set_io_weight(Unit *u, uint64_t weight) { + char buf[STRLEN("default \n")+DECIMAL_STR_MAX(uint64_t)]; + + assert(u); + + (void) set_bfq_weight(u, "io", makedev(0, 0), weight); + + xsprintf(buf, "default %" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "io", "io.weight", buf); +} + +static void set_blkio_weight(Unit *u, uint64_t weight) { + char buf[STRLEN("\n")+DECIMAL_STR_MAX(uint64_t)]; + + assert(u); + + (void) set_bfq_weight(u, "blkio", makedev(0, 0), weight); + + xsprintf(buf, "%" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf); +} + +static void cgroup_apply_bpf_foreign_program(Unit *u) { + assert(u); + + (void) bpf_foreign_install(u); +} + +static void cgroup_context_apply( + Unit *u, + CGroupMask apply_mask, + ManagerState state) { + + const char *path; + CGroupContext *c; + bool is_host_root, is_local_root; + int r; + + assert(u); + + /* Nothing to do? Exit early! */ + if (apply_mask == 0) + return; + + /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other + * attributes should only be managed for cgroups further down the tree. */ + is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE); + is_host_root = unit_has_host_root_cgroup(u); + + assert_se(c = unit_get_cgroup_context(u)); + assert_se(path = u->cgroup_path); + + if (is_local_root) /* Make sure we don't try to display messages with an empty path. */ + path = "/"; + + /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container + * then), and missing cgroups, i.e. EROFS and ENOENT. */ + + /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but + * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this + * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of + * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used + * we couldn't even write to them if we wanted to). */ + if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) { + + if (cg_all_unified() > 0) { + uint64_t weight; + + if (cgroup_context_has_cpu_weight(c)) + weight = cgroup_context_cpu_weight(c, state); + else if (cgroup_context_has_cpu_shares(c)) { + uint64_t shares; + + shares = cgroup_context_cpu_shares(c, state); + weight = cgroup_cpu_shares_to_weight(shares); + + log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s", + shares, weight, path); + } else + weight = CGROUP_WEIGHT_DEFAULT; + + cgroup_apply_unified_cpu_idle(u, weight); + cgroup_apply_unified_cpu_weight(u, weight); + cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec); + + } else { + uint64_t shares; + + if (cgroup_context_has_cpu_weight(c)) { + uint64_t weight; + + weight = cgroup_context_cpu_weight(c, state); + shares = cgroup_cpu_weight_to_shares(weight); + + log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s", + weight, shares, path); + } else if (cgroup_context_has_cpu_shares(c)) + shares = cgroup_context_cpu_shares(c, state); + else + shares = CGROUP_CPU_SHARES_DEFAULT; + + cgroup_apply_legacy_cpu_shares(u, shares); + cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec); + } + } + + if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) { + cgroup_apply_unified_cpuset(u, cgroup_context_allowed_cpus(c, state), "cpuset.cpus"); + cgroup_apply_unified_cpuset(u, cgroup_context_allowed_mems(c, state), "cpuset.mems"); + } + + /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2 + * controller), and in case of containers we want to leave control of these attributes to the container manager + * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */ + if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) { + bool has_io, has_blockio; + uint64_t weight; + + has_io = cgroup_context_has_io_config(c); + has_blockio = cgroup_context_has_blockio_config(c); + + if (has_io) + weight = cgroup_context_io_weight(c, state); + else if (has_blockio) { + uint64_t blkio_weight; + + blkio_weight = cgroup_context_blkio_weight(c, state); + weight = cgroup_weight_blkio_to_io(blkio_weight); + + log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64, + blkio_weight, weight); + } else + weight = CGROUP_WEIGHT_DEFAULT; + + set_io_weight(u, weight); + + if (has_io) { + LIST_FOREACH(device_weights, w, c->io_device_weights) + cgroup_apply_io_device_weight(u, w->path, w->weight); + + LIST_FOREACH(device_limits, limit, c->io_device_limits) + cgroup_apply_io_device_limit(u, limit->path, limit->limits); + + LIST_FOREACH(device_latencies, latency, c->io_device_latencies) + cgroup_apply_io_device_latency(u, latency->path, latency->target_usec); + + } else if (has_blockio) { + LIST_FOREACH(device_weights, w, c->blockio_device_weights) { + weight = cgroup_weight_blkio_to_io(w->weight); + + log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s", + w->weight, weight, w->path); + + cgroup_apply_io_device_weight(u, w->path, weight); + } + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX]; + + for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + limits[type] = cgroup_io_limit_defaults[type]; + + limits[CGROUP_IO_RBPS_MAX] = b->rbps; + limits[CGROUP_IO_WBPS_MAX] = b->wbps; + + log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s", + b->rbps, b->wbps, b->path); + + cgroup_apply_io_device_limit(u, b->path, limits); + } + } + } + + if (apply_mask & CGROUP_MASK_BLKIO) { + bool has_io, has_blockio; + + has_io = cgroup_context_has_io_config(c); + has_blockio = cgroup_context_has_blockio_config(c); + + /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be + * left to our container manager, too. */ + if (!is_local_root) { + uint64_t weight; + + if (has_io) { + uint64_t io_weight; + + io_weight = cgroup_context_io_weight(c, state); + weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state)); + + log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64, + io_weight, weight); + } else if (has_blockio) + weight = cgroup_context_blkio_weight(c, state); + else + weight = CGROUP_BLKIO_WEIGHT_DEFAULT; + + set_blkio_weight(u, weight); + + if (has_io) + LIST_FOREACH(device_weights, w, c->io_device_weights) { + weight = cgroup_weight_io_to_blkio(w->weight); + + log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s", + w->weight, weight, w->path); + + cgroup_apply_blkio_device_weight(u, w->path, weight); + } + else if (has_blockio) + LIST_FOREACH(device_weights, w, c->blockio_device_weights) + cgroup_apply_blkio_device_weight(u, w->path, w->weight); + } + + /* The bandwidth limits are something that make sense to be applied to the host's root but not container + * roots, as there we want the container manager to handle it */ + if (is_host_root || !is_local_root) { + if (has_io) + LIST_FOREACH(device_limits, l, c->io_device_limits) { + log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s", + l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path); + + cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]); + } + else if (has_blockio) + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) + cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps); + } + } + + /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes' + * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we + * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even + * write to this if we wanted to.) */ + if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) { + + if (cg_all_unified() > 0) { + uint64_t max, swap_max = CGROUP_LIMIT_MAX, zswap_max = CGROUP_LIMIT_MAX, high = CGROUP_LIMIT_MAX; + + if (unit_has_unified_memory_config(u)) { + bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING); + + high = startup && c->startup_memory_high_set ? c->startup_memory_high : c->memory_high; + max = startup && c->startup_memory_max_set ? c->startup_memory_max : c->memory_max; + swap_max = startup && c->startup_memory_swap_max_set ? c->startup_memory_swap_max : c->memory_swap_max; + zswap_max = startup && c->startup_memory_zswap_max_set ? c->startup_memory_zswap_max : c->memory_zswap_max; + } else { + max = c->memory_limit; + + if (max != CGROUP_LIMIT_MAX) + log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max); + } + + cgroup_apply_unified_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u)); + cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u)); + cgroup_apply_unified_memory_limit(u, "memory.high", high); + cgroup_apply_unified_memory_limit(u, "memory.max", max); + cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max); + cgroup_apply_unified_memory_limit(u, "memory.zswap.max", zswap_max); + + (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group)); + + } else { + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + uint64_t val; + + if (unit_has_unified_memory_config(u)) { + val = c->memory_max; + if (val != CGROUP_LIMIT_MAX) + log_cgroup_compat(u, "Applying MemoryMax=%" PRIu64 " as MemoryLimit=", val); + } else + val = c->memory_limit; + + if (val == CGROUP_LIMIT_MAX) + strncpy(buf, "-1\n", sizeof(buf)); + else + xsprintf(buf, "%" PRIu64 "\n", val); + + (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf); + } + } + + /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of + * containers, where we leave this to the manager */ + if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) && + (is_host_root || cg_all_unified() > 0 || !is_local_root)) + (void) cgroup_apply_devices(u); + + if (apply_mask & CGROUP_MASK_PIDS) { + + if (is_host_root) { + /* So, the "pids" controller does not expose anything on the root cgroup, in order not to + * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when + * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a + * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take + * exclusive ownership of the sysctls, but we still want to honour things if the user sets + * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit + * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded) + * it also counts. But if the user never set a limit through us (i.e. we are the default of + * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on + * the first time we set a limit. Note that this boolean is flushed out on manager reload, + * which is desirable so that there's an official way to release control of the sysctl from + * systemd: set the limit to unbounded and reload. */ + + if (cgroup_tasks_max_isset(&c->tasks_max)) { + u->manager->sysctl_pid_max_changed = true; + r = procfs_tasks_set_limit(cgroup_tasks_max_resolve(&c->tasks_max)); + } else if (u->manager->sysctl_pid_max_changed) + r = procfs_tasks_set_limit(TASKS_MAX); + else + r = 0; + if (r < 0) + log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, + "Failed to write to tasks limit sysctls: %m"); + } + + /* The attribute itself is not available on the host root cgroup, and in the container case we want to + * leave it for the container manager. */ + if (!is_local_root) { + if (cgroup_tasks_max_isset(&c->tasks_max)) { + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + + xsprintf(buf, "%" PRIu64 "\n", cgroup_tasks_max_resolve(&c->tasks_max)); + (void) set_attribute_and_warn(u, "pids", "pids.max", buf); + } else + (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n"); + } + } + + if (apply_mask & CGROUP_MASK_BPF_FIREWALL) + cgroup_apply_firewall(u); + + if (apply_mask & CGROUP_MASK_BPF_FOREIGN) + cgroup_apply_bpf_foreign_program(u); + + if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND) + cgroup_apply_socket_bind(u); + + if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES) + cgroup_apply_restrict_network_interfaces(u); + + unit_modify_nft_set(u, /* add = */ true); +} + +static bool unit_get_needs_bpf_firewall(Unit *u) { + CGroupContext *c; + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + if (c->ip_accounting || + !set_isempty(c->ip_address_allow) || + !set_isempty(c->ip_address_deny) || + c->ip_filters_ingress || + c->ip_filters_egress) + return true; + + /* If any parent slice has an IP access list defined, it applies too */ + for (Unit *p = UNIT_GET_SLICE(u); p; p = UNIT_GET_SLICE(p)) { + c = unit_get_cgroup_context(p); + if (!c) + return false; + + if (!set_isempty(c->ip_address_allow) || + !set_isempty(c->ip_address_deny)) + return true; + } + + return false; +} + +static bool unit_get_needs_bpf_foreign_program(Unit *u) { + CGroupContext *c; + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + return !!c->bpf_foreign_programs; +} + +static bool unit_get_needs_socket_bind(Unit *u) { + CGroupContext *c; + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + return c->socket_bind_allow || c->socket_bind_deny; +} + +static bool unit_get_needs_restrict_network_interfaces(Unit *u) { + CGroupContext *c; + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + return !set_isempty(c->restrict_network_interfaces); +} + +static CGroupMask unit_get_cgroup_mask(Unit *u) { + CGroupMask mask = 0; + CGroupContext *c; + + assert(u); + + assert_se(c = unit_get_cgroup_context(u)); + + /* Figure out which controllers we need, based on the cgroup context object */ + + if (c->cpu_accounting) + mask |= get_cpu_accounting_mask(); + + if (cgroup_context_has_cpu_weight(c) || + cgroup_context_has_cpu_shares(c) || + c->cpu_quota_per_sec_usec != USEC_INFINITY) + mask |= CGROUP_MASK_CPU; + + if (cgroup_context_has_allowed_cpus(c) || cgroup_context_has_allowed_mems(c)) + mask |= CGROUP_MASK_CPUSET; + + if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c)) + mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO; + + if (c->memory_accounting || + c->memory_limit != CGROUP_LIMIT_MAX || + unit_has_unified_memory_config(u)) + mask |= CGROUP_MASK_MEMORY; + + if (c->device_allow || + c->device_policy != CGROUP_DEVICE_POLICY_AUTO) + mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES; + + if (c->tasks_accounting || + cgroup_tasks_max_isset(&c->tasks_max)) + mask |= CGROUP_MASK_PIDS; + + return CGROUP_MASK_EXTEND_JOINED(mask); +} + +static CGroupMask unit_get_bpf_mask(Unit *u) { + CGroupMask mask = 0; + + /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children + * too. */ + + if (unit_get_needs_bpf_firewall(u)) + mask |= CGROUP_MASK_BPF_FIREWALL; + + if (unit_get_needs_bpf_foreign_program(u)) + mask |= CGROUP_MASK_BPF_FOREIGN; + + if (unit_get_needs_socket_bind(u)) + mask |= CGROUP_MASK_BPF_SOCKET_BIND; + + if (unit_get_needs_restrict_network_interfaces(u)) + mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES; + + return mask; +} + +CGroupMask unit_get_own_mask(Unit *u) { + CGroupContext *c; + + /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty + * mask, as we shouldn't reflect it in the cgroup hierarchy then. */ + + if (u->load_state != UNIT_LOADED) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u); +} + +CGroupMask unit_get_delegate_mask(Unit *u) { + CGroupContext *c; + + /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the + * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers. + * + * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */ + + if (!unit_cgroup_delegate(u)) + return 0; + + if (cg_all_unified() <= 0) { + ExecContext *e; + + e = unit_get_exec_context(u); + if (e && !exec_context_maintains_privileges(e)) + return 0; + } + + assert_se(c = unit_get_cgroup_context(u)); + return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers); +} + +static CGroupMask unit_get_subtree_mask(Unit *u) { + + /* Returns the mask of this subtree, meaning of the group + * itself and its children. */ + + return unit_get_own_mask(u) | unit_get_members_mask(u); +} + +CGroupMask unit_get_members_mask(Unit *u) { + assert(u); + + /* Returns the mask of controllers all of the unit's children require, merged */ + + if (u->cgroup_members_mask_valid) + return u->cgroup_members_mask; /* Use cached value if possible */ + + u->cgroup_members_mask = 0; + + if (u->type == UNIT_SLICE) { + Unit *member; + + UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF) + u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */ + } + + u->cgroup_members_mask_valid = true; + return u->cgroup_members_mask; +} + +CGroupMask unit_get_siblings_mask(Unit *u) { + Unit *slice; + assert(u); + + /* Returns the mask of controllers all of the unit's siblings + * require, i.e. the members mask of the unit's parent slice + * if there is one. */ + + slice = UNIT_GET_SLICE(u); + if (slice) + return unit_get_members_mask(slice); + + return unit_get_subtree_mask(u); /* we are the top-level slice */ +} + +static CGroupMask unit_get_disable_mask(Unit *u) { + CGroupContext *c; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + return c->disable_controllers; +} + +CGroupMask unit_get_ancestor_disable_mask(Unit *u) { + CGroupMask mask; + Unit *slice; + + assert(u); + mask = unit_get_disable_mask(u); + + /* Returns the mask of controllers which are marked as forcibly + * disabled in any ancestor unit or the unit in question. */ + + slice = UNIT_GET_SLICE(u); + if (slice) + mask |= unit_get_ancestor_disable_mask(slice); + + return mask; +} + +CGroupMask unit_get_target_mask(Unit *u) { + CGroupMask own_mask, mask; + + /* This returns the cgroup mask of all controllers to enable for a specific cgroup, i.e. everything + * it needs itself, plus all that its children need, plus all that its siblings need. This is + * primarily useful on the legacy cgroup hierarchy, where we need to duplicate each cgroup in each + * hierarchy that shall be enabled for it. */ + + own_mask = unit_get_own_mask(u); + + if (own_mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported) + emit_bpf_firewall_warning(u); + + mask = own_mask | unit_get_members_mask(u) | unit_get_siblings_mask(u); + + mask &= u->manager->cgroup_supported; + mask &= ~unit_get_ancestor_disable_mask(u); + + return mask; +} + +CGroupMask unit_get_enable_mask(Unit *u) { + CGroupMask mask; + + /* This returns the cgroup mask of all controllers to enable + * for the children of a specific cgroup. This is primarily + * useful for the unified cgroup hierarchy, where each cgroup + * controls which controllers are enabled for its children. */ + + mask = unit_get_members_mask(u); + mask &= u->manager->cgroup_supported; + mask &= ~unit_get_ancestor_disable_mask(u); + + return mask; +} + +void unit_invalidate_cgroup_members_masks(Unit *u) { + Unit *slice; + + assert(u); + + /* Recurse invalidate the member masks cache all the way up the tree */ + u->cgroup_members_mask_valid = false; + + slice = UNIT_GET_SLICE(u); + if (slice) + unit_invalidate_cgroup_members_masks(slice); +} + +const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) { + + /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */ + + while (u) { + + if (u->cgroup_path && + u->cgroup_realized && + FLAGS_SET(u->cgroup_realized_mask, mask)) + return u->cgroup_path; + + u = UNIT_GET_SLICE(u); + } + + return NULL; +} + +static const char *migrate_callback(CGroupMask mask, void *userdata) { + /* If not realized at all, migrate to root (""). + * It may happen if we're upgrading from older version that didn't clean up. + */ + return strempty(unit_get_realized_cgroup_path(userdata, mask)); +} + +int unit_default_cgroup_path(const Unit *u, char **ret) { + _cleanup_free_ char *p = NULL; + int r; + + assert(u); + assert(ret); + + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + p = strdup(u->manager->cgroup_root); + else { + _cleanup_free_ char *escaped = NULL, *slice_path = NULL; + Unit *slice; + + slice = UNIT_GET_SLICE(u); + if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) { + r = cg_slice_to_path(slice->id, &slice_path); + if (r < 0) + return r; + } + + r = cg_escape(u->id, &escaped); + if (r < 0) + return r; + + p = path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped); + } + if (!p) + return -ENOMEM; + + *ret = TAKE_PTR(p); + return 0; +} + +int unit_set_cgroup_path(Unit *u, const char *path) { + _cleanup_free_ char *p = NULL; + int r; + + assert(u); + + if (streq_ptr(u->cgroup_path, path)) + return 0; + + if (path) { + p = strdup(path); + if (!p) + return -ENOMEM; + } + + if (p) { + r = hashmap_put(u->manager->cgroup_unit, p, u); + if (r < 0) + return r; + } + + unit_release_cgroup(u); + u->cgroup_path = TAKE_PTR(p); + + return 1; +} + +int unit_watch_cgroup(Unit *u) { + _cleanup_free_ char *events = NULL; + int r; + + assert(u); + + /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if + * cgroupv2 is available. */ + + if (!u->cgroup_path) + return 0; + + if (u->cgroup_control_inotify_wd >= 0) + return 0; + + /* Only applies to the unified hierarchy */ + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m"); + if (r == 0) + return 0; + + /* No point in watch the top-level slice, it's never going to run empty. */ + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops); + if (r < 0) + return log_oom(); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events); + if (r < 0) + return log_oom(); + + u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); + if (u->cgroup_control_inotify_wd < 0) { + + if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this + * is not an error */ + return 0; + + return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path)); + } + + r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path)); + + return 0; +} + +int unit_watch_cgroup_memory(Unit *u) { + _cleanup_free_ char *events = NULL; + CGroupContext *c; + int r; + + assert(u); + + /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if + * cgroupv2 is available. */ + + if (!u->cgroup_path) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie + * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after + * all. */ + if (!c->memory_accounting) + return 0; + + /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and + * we also don't want to generate a log message for each parent cgroup of a process. */ + if (u->type == UNIT_SLICE) + return 0; + + if (u->cgroup_memory_inotify_wd >= 0) + return 0; + + /* Only applies to the unified hierarchy */ + r = cg_all_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m"); + if (r == 0) + return 0; + + r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops); + if (r < 0) + return log_oom(); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events); + if (r < 0) + return log_oom(); + + u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); + if (u->cgroup_memory_inotify_wd < 0) { + + if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this + * is not an error */ + return 0; + + return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path)); + } + + r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path)); + + return 0; +} + +int unit_pick_cgroup_path(Unit *u) { + _cleanup_free_ char *path = NULL; + int r; + + assert(u); + + if (u->cgroup_path) + return 0; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + r = unit_default_cgroup_path(u, &path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m"); + + r = unit_set_cgroup_path(u, path); + if (r == -EEXIST) + return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(path)); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(path)); + + return 0; +} + +static int unit_update_cgroup( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask, + ManagerState state) { + + bool created, is_root_slice; + CGroupMask migrate_mask = 0; + _cleanup_free_ char *cgroup_full_path = NULL; + int r; + + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + /* Figure out our cgroup path */ + r = unit_pick_cgroup_path(u); + if (r < 0) + return r; + + /* First, create our own group */ + r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(u->cgroup_path)); + created = r; + + if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) { + uint64_t cgroup_id = 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_full_path); + if (r == 0) { + r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id); + if (r < 0) + log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path); + } else + log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path)); + + u->cgroup_id = cgroup_id; + } + + /* Start watching it */ + (void) unit_watch_cgroup(u); + (void) unit_watch_cgroup_memory(u); + + /* For v2 we preserve enabled controllers in delegated units, adjust others, + * for v1 we figure out which controller hierarchies need migration. */ + if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) { + CGroupMask result_mask = 0; + + /* Enable all controllers we need */ + r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path)); + + /* Remember what's actually enabled now */ + u->cgroup_enabled_mask = result_mask; + + migrate_mask = u->cgroup_realized_mask ^ target_mask; + } + + /* Keep track that this is now realized */ + u->cgroup_realized = true; + u->cgroup_realized_mask = target_mask; + + /* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling). + * + * Unnecessary controller cgroups are trimmed (after emptied by upward migration). + * We perform migration also with whole slices for cases when users don't care about leave + * granularity. Since delegated_mask is subset of target mask, we won't trim slice subtree containing + * delegated units. + */ + if (cg_all_unified() == 0) { + r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(u->cgroup_path)); + + is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE); + r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(u->cgroup_path)); + } + + /* Set attributes */ + cgroup_context_apply(u, target_mask, state); + cgroup_xattr_apply(u); + + /* For most units we expect that memory monitoring is set up before the unit is started and we won't + * touch it after. For PID 1 this is different though, because we couldn't possibly do that given + * that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, let's + * try to open the memory pressure interface anew. */ + if (unit_has_name(u, SPECIAL_INIT_SCOPE)) + (void) manager_setup_memory_pressure_event_source(u->manager); + + return 0; +} + +static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + char *pp; + int r; + + assert(u); + + if (MANAGER_IS_SYSTEM(u->manager)) + return -EINVAL; + + if (!u->manager->system_bus) + return -EIO; + + if (!u->cgroup_path) + return -EINVAL; + + /* Determine this unit's cgroup path relative to our cgroup root */ + pp = path_startswith(u->cgroup_path, u->manager->cgroup_root); + if (!pp) + return -EINVAL; + + pp = strjoina("/", pp, suffix_path); + path_simplify(pp); + + r = bus_call_method(u->manager->system_bus, + bus_systemd_mgr, + "AttachProcessesToUnit", + &error, NULL, + "ssau", + NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r)); + + return 0; +} + +int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) { + _cleanup_free_ char *joined = NULL; + CGroupMask delegated_mask; + const char *p; + PidRef *pid; + int ret, r; + + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + if (set_isempty(pids)) + return 0; + + /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable. + * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */ + r = bpf_firewall_load_custom(u); + if (r < 0) + return r; + + r = unit_realize_cgroup(u); + if (r < 0) + return r; + + if (isempty(suffix_path)) + p = u->cgroup_path; + else { + joined = path_join(u->cgroup_path, suffix_path); + if (!joined) + return -ENOMEM; + + p = joined; + } + + delegated_mask = unit_get_delegate_mask(u); + + ret = 0; + SET_FOREACH(pid, pids) { + + /* Unfortunately we cannot add pids by pidfd to a cgroup. Hence we have to use PIDs instead, + * which of course is racy. Let's shorten the race a bit though, and re-validate the PID + * before we use it */ + r = pidref_verify(pid); + if (r < 0) { + log_unit_info_errno(u, r, "PID " PID_FMT " vanished before we could move it to target cgroup '%s', skipping: %m", pid->pid, empty_to_root(p)); + continue; + } + + /* First, attach the PID to the main cgroup hierarchy */ + r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid->pid); + if (r < 0) { + bool again = MANAGER_IS_USER(u->manager) && ERRNO_IS_PRIVILEGE(r); + + log_unit_full_errno(u, again ? LOG_DEBUG : LOG_INFO, r, + "Couldn't move process "PID_FMT" to%s requested cgroup '%s': %m", + pid->pid, again ? " directly" : "", empty_to_root(p)); + + if (again) { + int z; + + /* If we are in a user instance, and we can't move the process ourselves due + * to permission problems, let's ask the system instance about it instead. + * Since it's more privileged it might be able to move the process across the + * leaves of a subtree whose top node is not owned by us. */ + + z = unit_attach_pid_to_cgroup_via_bus(u, pid->pid, suffix_path); + if (z < 0) + log_unit_info_errno(u, z, "Couldn't move process "PID_FMT" to requested cgroup '%s' (directly or via the system bus): %m", pid->pid, empty_to_root(p)); + else { + if (ret >= 0) + ret++; /* Count successful additions */ + continue; /* When the bus thing worked via the bus we are fully done for this PID. */ + } + } + + if (ret >= 0) + ret = r; /* Remember first error */ + + continue; + } else if (ret >= 0) + ret++; /* Count successful additions */ + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + continue; + + /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the + * innermost realized one */ + + for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *realized; + + if (!(u->manager->cgroup_supported & bit)) + continue; + + /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */ + if (delegated_mask & u->cgroup_realized_mask & bit) { + r = cg_attach(cgroup_controller_to_string(c), p, pid->pid); + if (r >= 0) + continue; /* Success! */ + + log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m", + pid->pid, empty_to_root(p), cgroup_controller_to_string(c)); + } + + /* So this controller is either not delegate or realized, or something else weird happened. In + * that case let's attach the PID at least to the closest cgroup up the tree that is + * realized. */ + realized = unit_get_realized_cgroup_path(u, bit); + if (!realized) + continue; /* Not even realized in the root slice? Then let's not bother */ + + r = cg_attach(cgroup_controller_to_string(c), realized, pid->pid); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m", + pid->pid, realized, cgroup_controller_to_string(c)); + } + } + + return ret; +} + +static bool unit_has_mask_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + + assert(u); + + /* Returns true if this unit is fully realized. We check four things: + * + * 1. Whether the cgroup was created at all + * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1) + * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2) + * 4. Whether the invalidation mask is currently zero + * + * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note + * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for + * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask + * is only matters for cgroup v1 controllers, and cgroup_enabled_mask only used for cgroup v2, and if they + * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks with controllers are + * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they + * simply don't matter. */ + + return u->cgroup_realized && + ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 && + ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 && + u->cgroup_invalidated_mask == 0; +} + +static bool unit_has_mask_disables_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + + assert(u); + + /* Returns true if all controllers which should be disabled are indeed disabled. + * + * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is + * already removed. */ + + return !u->cgroup_realized || + (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) && + FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2)); +} + +static bool unit_has_mask_enables_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + + assert(u); + + /* Returns true if all controllers which should be enabled are indeed enabled. + * + * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything + * we want to add is already added. */ + + return u->cgroup_realized && + ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) && + ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2); +} + +void unit_add_to_cgroup_realize_queue(Unit *u) { + assert(u); + + if (u->in_cgroup_realize_queue) + return; + + LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u); + u->in_cgroup_realize_queue = true; +} + +static void unit_remove_from_cgroup_realize_queue(Unit *u) { + assert(u); + + if (!u->in_cgroup_realize_queue) + return; + + LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u); + u->in_cgroup_realize_queue = false; +} + +/* Controllers can only be enabled breadth-first, from the root of the + * hierarchy downwards to the unit in question. */ +static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) { + CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask; + Unit *slice; + int r; + + assert(u); + + /* First go deal with this unit's parent, or we won't be able to enable + * any new controllers at this layer. */ + slice = UNIT_GET_SLICE(u); + if (slice) { + r = unit_realize_cgroup_now_enable(slice, state); + if (r < 0) + return r; + } + + target_mask = unit_get_target_mask(u); + enable_mask = unit_get_enable_mask(u); + + /* We can only enable in this direction, don't try to disable anything. + */ + if (unit_has_mask_enables_realized(u, target_mask, enable_mask)) + return 0; + + new_target_mask = u->cgroup_realized_mask | target_mask; + new_enable_mask = u->cgroup_enabled_mask | enable_mask; + + return unit_update_cgroup(u, new_target_mask, new_enable_mask, state); +} + +/* Controllers can only be disabled depth-first, from the leaves of the + * hierarchy upwards to the unit in question. */ +static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) { + Unit *m; + + assert(u); + + if (u->type != UNIT_SLICE) + return 0; + + UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) { + CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask; + int r; + + /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't + * holding any controllers open anyway. */ + if (!m->cgroup_realized) + continue; + + /* We must disable those below us first in order to release the controller. */ + if (m->type == UNIT_SLICE) + (void) unit_realize_cgroup_now_disable(m, state); + + target_mask = unit_get_target_mask(m); + enable_mask = unit_get_enable_mask(m); + + /* We can only disable in this direction, don't try to enable anything. */ + if (unit_has_mask_disables_realized(m, target_mask, enable_mask)) + continue; + + new_target_mask = m->cgroup_realized_mask & target_mask; + new_enable_mask = m->cgroup_enabled_mask & enable_mask; + + r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state); + if (r < 0) + return r; + } + + return 0; +} + +/* Check if necessary controllers and attributes for a unit are in place. + * + * - If so, do nothing. + * - If not, create paths, move processes over, and set attributes. + * + * Controllers can only be *enabled* in a breadth-first way, and *disabled* in + * a depth-first way. As such the process looks like this: + * + * Suppose we have a cgroup hierarchy which looks like this: + * + * root + * / \ + * / \ + * / \ + * a b + * / \ / \ + * / \ / \ + * c d e f + * / \ / \ / \ / \ + * h i j k l m n o + * + * 1. We want to realise cgroup "d" now. + * 2. cgroup "a" has DisableControllers=cpu in the associated unit. + * 3. cgroup "k" just started requesting the memory controller. + * + * To make this work we must do the following in order: + * + * 1. Disable CPU controller in k, j + * 2. Disable CPU controller in d + * 3. Enable memory controller in root + * 4. Enable memory controller in a + * 5. Enable memory controller in d + * 6. Enable memory controller in k + * + * Notice that we need to touch j in one direction, but not the other. We also + * don't go beyond d when disabling -- it's up to "a" to get realized if it + * wants to disable further. The basic rules are therefore: + * + * - If you're disabling something, you need to realise all of the cgroups from + * your recursive descendants to the root. This starts from the leaves. + * - If you're enabling something, you need to realise from the root cgroup + * downwards, but you don't need to iterate your recursive descendants. + * + * Returns 0 on success and < 0 on failure. */ +static int unit_realize_cgroup_now(Unit *u, ManagerState state) { + CGroupMask target_mask, enable_mask; + Unit *slice; + int r; + + assert(u); + + unit_remove_from_cgroup_realize_queue(u); + + target_mask = unit_get_target_mask(u); + enable_mask = unit_get_enable_mask(u); + + if (unit_has_mask_realized(u, target_mask, enable_mask)) + return 0; + + /* Disable controllers below us, if there are any */ + r = unit_realize_cgroup_now_disable(u, state); + if (r < 0) + return r; + + /* Enable controllers above us, if there are any */ + slice = UNIT_GET_SLICE(u); + if (slice) { + r = unit_realize_cgroup_now_enable(slice, state); + if (r < 0) + return r; + } + + /* Now actually deal with the cgroup we were trying to realise and set attributes */ + r = unit_update_cgroup(u, target_mask, enable_mask, state); + if (r < 0) + return r; + + /* Now, reset the invalidation mask */ + u->cgroup_invalidated_mask = 0; + return 0; +} + +unsigned manager_dispatch_cgroup_realize_queue(Manager *m) { + ManagerState state; + unsigned n = 0; + Unit *i; + int r; + + assert(m); + + state = manager_state(m); + + while ((i = m->cgroup_realize_queue)) { + assert(i->in_cgroup_realize_queue); + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) { + /* Maybe things changed, and the unit is not actually active anymore? */ + unit_remove_from_cgroup_realize_queue(i); + continue; + } + + r = unit_realize_cgroup_now(i, state); + if (r < 0) + log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id); + + n++; + } + + return n; +} + +void unit_add_family_to_cgroup_realize_queue(Unit *u) { + assert(u); + assert(u->type == UNIT_SLICE); + + /* Family of a unit for is defined as (immediate) children of the unit and immediate children of all + * its ancestors. + * + * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes + * very weird if two units that own processes reside in the same slice, but one is realized in the + * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does + * not), because that means individual processes need to be scheduled against whole cgroups. Let's + * avoid this asymmetry by always ensuring that siblings of a unit are always realized in their v1 + * controller hierarchies too (if unit requires the controller to be realized). + * + * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to date + * masks. */ + + do { + Unit *m; + + /* Children of u likely changed when we're called */ + u->cgroup_members_mask_valid = false; + + UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) { + + /* No point in doing cgroup application for units without active processes. */ + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m))) + continue; + + /* We only enqueue siblings if they were realized once at least, in the main + * hierarchy. */ + if (!m->cgroup_realized) + continue; + + /* If the unit doesn't need any new controllers and has current ones + * realized, it doesn't need any changes. */ + if (unit_has_mask_realized(m, + unit_get_target_mask(m), + unit_get_enable_mask(m))) + continue; + + unit_add_to_cgroup_realize_queue(m); + } + + /* Parent comes after children */ + unit_add_to_cgroup_realize_queue(u); + + u = UNIT_GET_SLICE(u); + } while (u); +} + +int unit_realize_cgroup(Unit *u) { + Unit *slice; + + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + /* So, here's the deal: when realizing the cgroups for this unit, we need to first create all + * parents, but there's more actually: for the weight-based controllers we also need to make sure + * that all our siblings (i.e. units that are in the same slice as we are) have cgroups, too. On the + * other hand, when a controller is removed from realized set, it may become unnecessary in siblings + * and ancestors and they should be (de)realized too. + * + * This call will defer work on the siblings and derealized ancestors to the next event loop + * iteration and synchronously creates the parent cgroups (unit_realize_cgroup_now). */ + + slice = UNIT_GET_SLICE(u); + if (slice) + unit_add_family_to_cgroup_realize_queue(slice); + + /* And realize this one now (and apply the values) */ + return unit_realize_cgroup_now(u, manager_state(u->manager)); +} + +void unit_release_cgroup(Unit *u) { + assert(u); + + /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call + * when we close down everything for reexecution, where we really want to leave the cgroup in place. */ + + if (u->cgroup_path) { + (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); + u->cgroup_path = mfree(u->cgroup_path); + } + + if (u->cgroup_control_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id); + + (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd)); + u->cgroup_control_inotify_wd = -1; + } + + if (u->cgroup_memory_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id); + + (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd)); + u->cgroup_memory_inotify_wd = -1; + } +} + +bool unit_maybe_release_cgroup(Unit *u) { + int r; + + assert(u); + + if (!u->cgroup_path) + return true; + + /* Don't release the cgroup if there are still processes under it. If we get notified later when all the + * processes exit (e.g. the processes were in D-state and exited after the unit was marked as failed) + * we need the cgroup paths to continue to be tracked by the manager so they can be looked up and cleaned + * up later. */ + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); + if (r < 0) + log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m"); + else if (r == 1) { + unit_release_cgroup(u); + return true; + } + + return false; +} + +void unit_prune_cgroup(Unit *u) { + int r; + bool is_root_slice; + + assert(u); + + /* Removes the cgroup, if empty and possible, and stops watching it. */ + + if (!u->cgroup_path) + return; + + (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */ + +#if BPF_FRAMEWORK + (void) lsm_bpf_cleanup(u); /* Remove cgroup from the global LSM BPF map */ +#endif + + unit_modify_nft_set(u, /* add = */ false); + + is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE); + + r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice); + if (r < 0) + /* One reason we could have failed here is, that the cgroup still contains a process. + * However, if the cgroup becomes removable at a later time, it might be removed when + * the containing slice is stopped. So even if we failed now, this unit shouldn't assume + * that the cgroup is still realized the next time it is started. Do not return early + * on error, continue cleanup. */ + log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path)); + + if (is_root_slice) + return; + + if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */ + return; + + u->cgroup_realized = false; + u->cgroup_realized_mask = 0; + u->cgroup_enabled_mask = 0; + + u->bpf_device_control_installed = bpf_program_free(u->bpf_device_control_installed); +} + +int unit_search_main_pid(Unit *u, PidRef *ret) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(u); + assert(ret); + + if (!u->cgroup_path) + return -ENXIO; + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f); + if (r < 0) + return r; + + for (;;) { + _cleanup_(pidref_done) PidRef npidref = PIDREF_NULL; + + r = cg_read_pidref(f, &npidref); + if (r < 0) + return r; + if (r == 0) + break; + + if (pidref_equal(&pidref, &npidref)) /* seen already, cgroupfs reports duplicates! */ + continue; + + if (pidref_is_my_child(&npidref) <= 0) /* ignore processes further down the tree */ + continue; + + if (pidref_is_set(&pidref) != 0) + /* Dang, there's more than one daemonized PID in this group, so we don't know what + * process is the main process. */ + return -ENODATA; + + pidref = TAKE_PIDREF(npidref); + } + + if (!pidref_is_set(&pidref)) + return -ENODATA; + + *ret = TAKE_PIDREF(pidref); + return 0; +} + +static int unit_watch_pids_in_path(Unit *u, const char *path) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_fclose_ FILE *f = NULL; + int ret = 0, r; + + assert(u); + assert(path); + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f); + if (r < 0) + RET_GATHER(ret, r); + else { + for (;;) { + _cleanup_(pidref_done) PidRef pid = PIDREF_NULL; + + r = cg_read_pidref(f, &pid); + if (r == 0) + break; + if (r < 0) { + RET_GATHER(ret, r); + break; + } + + RET_GATHER(ret, unit_watch_pidref(u, &pid, /* exclusive= */ false)); + } + } + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); + if (r < 0) + RET_GATHER(ret, r); + else { + for (;;) { + _cleanup_free_ char *fn = NULL, *p = NULL; + + r = cg_read_subgroup(d, &fn); + if (r == 0) + break; + if (r < 0) { + RET_GATHER(ret, r); + break; + } + + p = path_join(empty_to_root(path), fn); + if (!p) + return -ENOMEM; + + RET_GATHER(ret, unit_watch_pids_in_path(u, p)); + } + } + + return ret; +} + +int unit_synthesize_cgroup_empty_event(Unit *u) { + int r; + + assert(u); + + /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility + * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can + * get as notification source as soon as we stopped having any useful PIDs to watch for. */ + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we have reliable notifications, and don't need this */ + return 0; + + if (!set_isempty(u->pids)) + return 0; + + unit_add_to_cgroup_empty_queue(u); + return 0; +} + +int unit_watch_all_pids(Unit *u) { + int r; + + assert(u); + + /* Adds all PIDs from our cgroup to the set of PIDs we + * watch. This is a fallback logic for cases where we do not + * get reliable cgroup empty notifications: we try to use + * SIGCHLD as replacement. */ + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we can use proper notifications */ + return 0; + + return unit_watch_pids_in_path(u, u->cgroup_path); +} + +static int on_cgroup_empty_event(sd_event_source *s, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + Unit *u; + int r; + + assert(s); + + u = m->cgroup_empty_queue; + if (!u) + return 0; + + assert(u->in_cgroup_empty_queue); + u->in_cgroup_empty_queue = false; + LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u); + + if (m->cgroup_empty_queue) { + /* More stuff queued, let's make sure we remain enabled */ + r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT); + if (r < 0) + log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m"); + } + + /* Update state based on OOM kills before we notify about cgroup empty event */ + (void) unit_check_oom(u); + (void) unit_check_oomd_kill(u); + + unit_add_to_gc_queue(u); + + if (IN_SET(unit_active_state(u), UNIT_INACTIVE, UNIT_FAILED)) + unit_prune_cgroup(u); + else if (UNIT_VTABLE(u)->notify_cgroup_empty) + UNIT_VTABLE(u)->notify_cgroup_empty(u); + + return 0; +} + +void unit_add_to_cgroup_empty_queue(Unit *u) { + int r; + + assert(u); + + /* Note that there are four different ways how cgroup empty events reach us: + * + * 1. On the unified hierarchy we get an inotify event on the cgroup + * + * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket + * + * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus + * + * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as + * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications. + * + * Regardless which way we got the notification, we'll verify it here, and then add it to a separate + * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use + * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending + * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the + * case for scope units). */ + + if (u->in_cgroup_empty_queue) + return; + + /* Let's verify that the cgroup is really empty */ + if (!u->cgroup_path) + return; + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path)); + return; + } + if (r == 0) + return; + + LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u); + u->in_cgroup_empty_queue = true; + + /* Trigger the defer event */ + r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT); + if (r < 0) + log_debug_errno(r, "Failed to enable cgroup empty event source: %m"); +} + +static void unit_remove_from_cgroup_empty_queue(Unit *u) { + assert(u); + + if (!u->in_cgroup_empty_queue) + return; + + LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u); + u->in_cgroup_empty_queue = false; +} + +int unit_check_oomd_kill(Unit *u) { + _cleanup_free_ char *value = NULL; + bool increased; + uint64_t n = 0; + int r; + + if (!u->cgroup_path) + return 0; + + r = cg_all_unified(); + if (r < 0) + return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m"); + else if (r == 0) + return 0; + + r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_ooms", &value); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) + return r; + + if (!isempty(value)) { + r = safe_atou64(value, &n); + if (r < 0) + return r; + } + + increased = n > u->managed_oom_kill_last; + u->managed_oom_kill_last = n; + + if (!increased) + return 0; + + n = 0; + value = mfree(value); + r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_kill", &value); + if (r >= 0 && !isempty(value)) + (void) safe_atou64(value, &n); + + if (n > 0) + log_unit_struct(u, LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR, + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n), + "N_PROCESSES=%" PRIu64, n); + else + log_unit_struct(u, LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR, + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "systemd-oomd killed some process(es) in this unit.")); + + unit_notify_cgroup_oom(u, /* ManagedOOM= */ true); + + return 1; +} + +int unit_check_oom(Unit *u) { + _cleanup_free_ char *oom_kill = NULL; + bool increased; + uint64_t c; + int r; + + if (!u->cgroup_path) + return 0; + + r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill); + if (IN_SET(r, -ENOENT, -ENXIO)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */ + c = 0; + else if (r < 0) + return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m"); + else { + r = safe_atou64(oom_kill, &c); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m"); + } + + increased = c > u->oom_kill_last; + u->oom_kill_last = c; + + if (!increased) + return 0; + + log_unit_struct(u, LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR, + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer.")); + + unit_notify_cgroup_oom(u, /* ManagedOOM= */ false); + + return 1; +} + +static int on_cgroup_oom_event(sd_event_source *s, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + Unit *u; + int r; + + assert(s); + + u = m->cgroup_oom_queue; + if (!u) + return 0; + + assert(u->in_cgroup_oom_queue); + u->in_cgroup_oom_queue = false; + LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u); + + if (m->cgroup_oom_queue) { + /* More stuff queued, let's make sure we remain enabled */ + r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT); + if (r < 0) + log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m"); + } + + (void) unit_check_oom(u); + unit_add_to_gc_queue(u); + + return 0; +} + +static void unit_add_to_cgroup_oom_queue(Unit *u) { + int r; + + assert(u); + + if (u->in_cgroup_oom_queue) + return; + if (!u->cgroup_path) + return; + + LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u); + u->in_cgroup_oom_queue = true; + + /* Trigger the defer event */ + if (!u->manager->cgroup_oom_event_source) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + + r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager); + if (r < 0) { + log_error_errno(r, "Failed to create cgroup oom event source: %m"); + return; + } + + r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8); + if (r < 0) { + log_error_errno(r, "Failed to set priority of cgroup oom event source: %m"); + return; + } + + (void) sd_event_source_set_description(s, "cgroup-oom"); + u->manager->cgroup_oom_event_source = TAKE_PTR(s); + } + + r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT); + if (r < 0) + log_error_errno(r, "Failed to enable cgroup oom event source: %m"); +} + +static int unit_check_cgroup_events(Unit *u) { + char *values[2] = {}; + int r; + + assert(u); + + if (!u->cgroup_path) + return 0; + + r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", + STRV_MAKE("populated", "frozen"), values); + if (r < 0) + return r; + + /* The cgroup.events notifications can be merged together so act as we saw the given state for the + * first time. The functions we call to handle given state are idempotent, which makes them + * effectively remember the previous state. */ + if (values[0]) { + if (streq(values[0], "1")) + unit_remove_from_cgroup_empty_queue(u); + else + unit_add_to_cgroup_empty_queue(u); + } + + /* Disregard freezer state changes due to operations not initiated by us */ + if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) { + if (streq(values[1], "0")) + unit_thawed(u); + else + unit_frozen(u); + } + + free(values[0]); + free(values[1]); + + return 0; +} + +static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + assert(fd >= 0); + + for (;;) { + union inotify_event_buffer buffer; + ssize_t l; + + l = read(fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return log_error_errno(errno, "Failed to read control group inotify events: %m"); + } + + FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) { + Unit *u; + + if (e->wd < 0) + /* Queue overflow has no watch descriptor */ + continue; + + if (e->mask & IN_IGNORED) + /* The watch was just removed */ + continue; + + /* Note that inotify might deliver events for a watch even after it was removed, + * because it was queued before the removal. Let's ignore this here safely. */ + + u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd)); + if (u) + unit_check_cgroup_events(u); + + u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd)); + if (u) + unit_add_to_cgroup_oom_queue(u); + } + } +} + +static int cg_bpf_mask_supported(CGroupMask *ret) { + CGroupMask mask = 0; + int r; + + /* BPF-based firewall */ + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r > 0) + mask |= CGROUP_MASK_BPF_FIREWALL; + + /* BPF-based device access control */ + r = bpf_devices_supported(); + if (r < 0) + return r; + if (r > 0) + mask |= CGROUP_MASK_BPF_DEVICES; + + /* BPF pinned prog */ + r = bpf_foreign_supported(); + if (r < 0) + return r; + if (r > 0) + mask |= CGROUP_MASK_BPF_FOREIGN; + + /* BPF-based bind{4|6} hooks */ + r = bpf_socket_bind_supported(); + if (r < 0) + return r; + if (r > 0) + mask |= CGROUP_MASK_BPF_SOCKET_BIND; + + /* BPF-based cgroup_skb/{egress|ingress} hooks */ + r = restrict_network_interfaces_supported(); + if (r < 0) + return r; + if (r > 0) + mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES; + + *ret = mask; + return 0; +} + +int manager_setup_cgroup(Manager *m) { + _cleanup_free_ char *path = NULL; + const char *scope_path; + int r, all_unified; + CGroupMask mask; + char *e; + + assert(m); + + /* 1. Determine hierarchy */ + m->cgroup_root = mfree(m->cgroup_root); + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root); + if (r < 0) + return log_error_errno(r, "Cannot determine cgroup we are running in: %m"); + + /* Chop off the init scope, if we are already located in it */ + e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); + + /* LEGACY: Also chop off the system slice if we are in + * it. This is to support live upgrades from older systemd + * versions where PID 1 was moved there. Also see + * cg_get_root_path(). */ + if (!e && MANAGER_IS_SYSTEM(m)) { + e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE); + if (!e) + e = endswith(m->cgroup_root, "/system"); /* even more legacy */ + } + if (e) + *e = 0; + + /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can + * easily prepend it everywhere. */ + delete_trailing_chars(m->cgroup_root, "/"); + + /* 2. Show data */ + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path); + if (r < 0) + return log_error_errno(r, "Cannot find cgroup mount point: %m"); + + r = cg_unified(); + if (r < 0) + return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m"); + + all_unified = cg_all_unified(); + if (all_unified < 0) + return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m"); + if (all_unified > 0) + log_debug("Unified cgroup hierarchy is located at %s.", path); + else { + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m"); + if (r > 0) + log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path); + else + log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path); + } + + /* 3. Allocate cgroup empty defer event source */ + m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source); + r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m); + if (r < 0) + return log_error_errno(r, "Failed to create cgroup empty event source: %m"); + + /* Schedule cgroup empty checks early, but after having processed service notification messages or + * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of + * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */ + r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5); + if (r < 0) + return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m"); + + r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF); + if (r < 0) + return log_error_errno(r, "Failed to disable cgroup empty event source: %m"); + + (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty"); + + /* 4. Install notifier inotify object, or agent */ + if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) { + + /* In the unified hierarchy we can get cgroup empty notifications via inotify. */ + + m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source); + safe_close(m->cgroup_inotify_fd); + + m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (m->cgroup_inotify_fd < 0) + return log_error_errno(errno, "Failed to create control group inotify object: %m"); + + r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m); + if (r < 0) + return log_error_errno(r, "Failed to watch control group inotify object: %m"); + + /* Process cgroup empty notifications early. Note that when this event is dispatched it'll + * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see + * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */ + r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9); + if (r < 0) + return log_error_errno(r, "Failed to set priority of inotify event source: %m"); + + (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify"); + + } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) { + + /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable, + * since it does not generate events when control groups with children run empty. */ + + r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUPS_AGENT_PATH); + if (r < 0) + log_warning_errno(r, "Failed to install release agent, ignoring: %m"); + else if (r > 0) + log_debug("Installed release agent."); + else if (r == 0) + log_debug("Release agent already installed."); + } + + /* 5. Make sure we are in the special "init.scope" unit in the root slice. */ + scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + if (r >= 0) { + /* Also, move all other userspace processes remaining in the root cgroup into that scope. */ + r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + if (r < 0) + log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m"); + + /* 6. And pin it, so that it cannot be unmounted */ + safe_close(m->pin_cgroupfs_fd); + m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK); + if (m->pin_cgroupfs_fd < 0) + return log_error_errno(errno, "Failed to open pin file: %m"); + + } else if (!MANAGER_IS_TEST_RUN(m)) + return log_error_errno(r, "Failed to create %s control group: %m", scope_path); + + /* 7. Always enable hierarchical support if it exists... */ + if (!all_unified && !MANAGER_IS_TEST_RUN(m)) + (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1"); + + /* 8. Figure out which controllers are supported */ + r = cg_mask_supported_subtree(m->cgroup_root, &m->cgroup_supported); + if (r < 0) + return log_error_errno(r, "Failed to determine supported controllers: %m"); + + /* 9. Figure out which bpf-based pseudo-controllers are supported */ + r = cg_bpf_mask_supported(&mask); + if (r < 0) + return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m"); + m->cgroup_supported |= mask; + + /* 10. Log which controllers are supported */ + for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) + log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), + yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c))); + + return 0; +} + +void manager_shutdown_cgroup(Manager *m, bool delete) { + assert(m); + + /* We can't really delete the group, since we are in it. But + * let's trim it. */ + if (delete && m->cgroup_root && !FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL)) + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false); + + m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source); + + m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit); + m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit); + + m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source); + m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd); + + m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd); + + m->cgroup_root = mfree(m->cgroup_root); +} + +Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) { + char *p; + Unit *u; + + assert(m); + assert(cgroup); + + u = hashmap_get(m->cgroup_unit, cgroup); + if (u) + return u; + + p = strdupa_safe(cgroup); + for (;;) { + char *e; + + e = strrchr(p, '/'); + if (!e || e == p) + return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE); + + *e = 0; + + u = hashmap_get(m->cgroup_unit, p); + if (u) + return u; + } +} + +Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid) { + _cleanup_free_ char *cgroup = NULL; + + assert(m); + + if (cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0) + return NULL; + + return manager_get_unit_by_cgroup(m, cgroup); +} + +Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid) { + Unit *u, **array; + + assert(m); + + if (!pidref_is_set(pid)) + return NULL; + + u = hashmap_get(m->watch_pids, pid); + if (u) + return u; + + array = hashmap_get(m->watch_pids_more, pid); + if (array) + return array[0]; + + return NULL; +} + +Unit *manager_get_unit_by_pidref(Manager *m, PidRef *pid) { + Unit *u; + + assert(m); + + /* Note that a process might be owned by multiple units, we return only one here, which is good + * enough for most cases, though not strictly correct. We prefer the one reported by cgroup + * membership, as that's the most relevant one as children of the process will be assigned to that + * one, too, before all else. */ + + if (!pidref_is_set(pid)) + return NULL; + + if (pidref_is_self(pid)) + return hashmap_get(m->units, SPECIAL_INIT_SCOPE); + if (pid->pid == 1) + return NULL; + + u = manager_get_unit_by_pidref_cgroup(m, pid); + if (u) + return u; + + u = manager_get_unit_by_pidref_watching(m, pid); + if (u) + return u; + + return NULL; +} + +Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) { + assert(m); + + if (!pid_is_valid(pid)) + return NULL; + + return manager_get_unit_by_pidref(m, &PIDREF_MAKE_FROM_PID(pid)); +} + +int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { + Unit *u; + + assert(m); + assert(cgroup); + + /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process + * or from the --system instance */ + + log_debug("Got cgroup empty notification for: %s", cgroup); + + u = manager_get_unit_by_cgroup(m, cgroup); + if (!u) + return 0; + + unit_add_to_cgroup_empty_queue(u); + return 1; +} + +int unit_get_memory_available(Unit *u, uint64_t *ret) { + uint64_t available = UINT64_MAX, current = 0; + + assert(u); + assert(ret); + + /* If data from cgroups can be accessed, try to find out how much more memory a unit can + * claim before hitting the configured cgroup limits (if any). Consider both MemoryHigh + * and MemoryMax, and also any slice the unit might be nested below. */ + + do { + uint64_t unit_available, unit_limit = UINT64_MAX; + CGroupContext *unit_context; + + /* No point in continuing if we can't go any lower */ + if (available == 0) + break; + + unit_context = unit_get_cgroup_context(u); + if (!unit_context) + return -ENODATA; + + if (!u->cgroup_path) + continue; + + (void) unit_get_memory_current(u, ¤t); + /* in case of error, previous current propagates as lower bound */ + + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + unit_limit = physical_memory(); + else if (unit_context->memory_max == UINT64_MAX && unit_context->memory_high == UINT64_MAX) + continue; + unit_limit = MIN3(unit_limit, unit_context->memory_max, unit_context->memory_high); + + unit_available = LESS_BY(unit_limit, current); + available = MIN(unit_available, available); + } while ((u = UNIT_GET_SLICE(u))); + + *ret = available; + + return 0; +} + +int unit_get_memory_current(Unit *u, uint64_t *ret) { + int r; + + // FIXME: Merge this into unit_get_memory_accounting after support for cgroup v1 is dropped + + assert(u); + assert(ret); + + if (!UNIT_CGROUP_BOOL(u, memory_accounting)) + return -ENODATA; + + if (!u->cgroup_path) + return -ENODATA; + + /* The root cgroup doesn't expose this information, let's get it from /proc instead */ + if (unit_has_host_root_cgroup(u)) + return procfs_memory_get_used(ret); + + if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + + return cg_get_attribute_as_uint64("memory", u->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret); +} + +int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret) { + + static const char* const attributes_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = { + [CGROUP_MEMORY_PEAK] = "memory.peak", + [CGROUP_MEMORY_SWAP_CURRENT] = "memory.swap.current", + [CGROUP_MEMORY_SWAP_PEAK] = "memory.swap.peak", + [CGROUP_MEMORY_ZSWAP_CURRENT] = "memory.zswap.current", + }; + + uint64_t bytes; + bool updated = false; + int r; + + assert(u); + assert(metric >= 0); + assert(metric < _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX); + + if (!UNIT_CGROUP_BOOL(u, memory_accounting)) + return -ENODATA; + + if (!u->cgroup_path) + /* If the cgroup is already gone, we try to find the last cached value. */ + goto finish; + + /* The root cgroup doesn't expose this information. */ + if (unit_has_host_root_cgroup(u)) + return -ENODATA; + + if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_MEMORY)) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) + return -ENODATA; + + r = cg_get_attribute_as_uint64("memory", u->cgroup_path, attributes_table[metric], &bytes); + if (r < 0 && r != -ENODATA) + return r; + updated = r >= 0; + +finish: + if (metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST) { + uint64_t *last = &u->memory_accounting_last[metric]; + + if (updated) + *last = bytes; + else if (*last != UINT64_MAX) + bytes = *last; + else + return -ENODATA; + + } else if (!updated) + return -ENODATA; + + if (ret) + *ret = bytes; + + return 0; +} + +int unit_get_tasks_current(Unit *u, uint64_t *ret) { + assert(u); + assert(ret); + + if (!UNIT_CGROUP_BOOL(u, tasks_accounting)) + return -ENODATA; + + if (!u->cgroup_path) + return -ENODATA; + + /* The root cgroup doesn't expose this information, let's get it from /proc instead */ + if (unit_has_host_root_cgroup(u)) + return procfs_tasks_get_current(ret); + + if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0) + return -ENODATA; + + return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret); +} + +static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { + uint64_t ns; + int r; + + assert(u); + assert(ret); + + if (!u->cgroup_path) + return -ENODATA; + + /* The root cgroup doesn't expose this information, let's get it from /proc instead */ + if (unit_has_host_root_cgroup(u)) + return procfs_cpu_get_usage(ret); + + /* Requisite controllers for CPU accounting are not enabled */ + if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) { + _cleanup_free_ char *val = NULL; + uint64_t us; + + r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val); + if (IN_SET(r, -ENOENT, -ENXIO)) + return -ENODATA; + if (r < 0) + return r; + + r = safe_atou64(val, &us); + if (r < 0) + return r; + + ns = us * NSEC_PER_USEC; + } else + return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret); + + *ret = ns; + return 0; +} + +int unit_get_cpu_usage(Unit *u, nsec_t *ret) { + nsec_t ns; + int r; + + assert(u); + + /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was + * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply + * call this function with a NULL return value. */ + + if (!UNIT_CGROUP_BOOL(u, cpu_accounting)) + return -ENODATA; + + r = unit_get_cpu_usage_raw(u, &ns); + if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) { + /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our + * cached value. */ + + if (ret) + *ret = u->cpu_usage_last; + return 0; + } + if (r < 0) + return r; + + if (ns > u->cpu_usage_base) + ns -= u->cpu_usage_base; + else + ns = 0; + + u->cpu_usage_last = ns; + if (ret) + *ret = ns; + + return 0; +} + +int unit_get_ip_accounting( + Unit *u, + CGroupIPAccountingMetric metric, + uint64_t *ret) { + + uint64_t value; + int fd, r; + + assert(u); + assert(metric >= 0); + assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX); + assert(ret); + + if (!UNIT_CGROUP_BOOL(u, ip_accounting)) + return -ENODATA; + + fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ? + u->ip_accounting_ingress_map_fd : + u->ip_accounting_egress_map_fd; + if (fd < 0) + return -ENODATA; + + if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES)) + r = bpf_firewall_read_accounting(fd, &value, NULL); + else + r = bpf_firewall_read_accounting(fd, NULL, &value); + if (r < 0) + return r; + + /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile + * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the + * ip_accounting_extra[] field, and add them in here transparently. */ + + *ret = value + u->ip_accounting_extra[metric]; + + return r; +} + +static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) { + static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "rbytes=", + [CGROUP_IO_WRITE_BYTES] = "wbytes=", + [CGROUP_IO_READ_OPERATIONS] = "rios=", + [CGROUP_IO_WRITE_OPERATIONS] = "wios=", + }; + uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {}; + _cleanup_free_ char *path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(u); + + if (!u->cgroup_path) + return -ENODATA; + + if (unit_has_host_root_cgroup(u)) + return -ENODATA; /* TODO: return useful data for the top-level cgroup */ + + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) /* TODO: support cgroupv1 */ + return -ENODATA; + + if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO)) + return -ENODATA; + + r = cg_get_path("io", u->cgroup_path, "io.stat", &path); + if (r < 0) + return r; + + f = fopen(path, "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *p; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + p = line; + p += strcspn(p, WHITESPACE); /* Skip over device major/minor */ + p += strspn(p, WHITESPACE); /* Skip over following whitespace */ + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + if (r == 0) + break; + + for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) { + const char *x; + + x = startswith(word, field_names[i]); + if (x) { + uint64_t w; + + r = safe_atou64(x, &w); + if (r < 0) + return r; + + /* Sum up the stats of all devices */ + acc[i] += w; + break; + } + } + } + } + + memcpy(ret, acc, sizeof(acc)); + return 0; +} + +int unit_get_io_accounting( + Unit *u, + CGroupIOAccountingMetric metric, + bool allow_cache, + uint64_t *ret) { + + uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; + int r; + + /* Retrieve an IO account parameter. This will subtract the counter when the unit was started. */ + + if (!UNIT_CGROUP_BOOL(u, io_accounting)) + return -ENODATA; + + if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX) + goto done; + + r = unit_get_io_accounting_raw(u, raw); + if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX) + goto done; + if (r < 0) + return r; + + for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) { + /* Saturated subtraction */ + if (raw[i] > u->io_accounting_base[i]) + u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i]; + else + u->io_accounting_last[i] = 0; + } + +done: + if (ret) + *ret = u->io_accounting_last[metric]; + + return 0; +} + +int unit_reset_cpu_accounting(Unit *u) { + int r; + + assert(u); + + u->cpu_usage_last = NSEC_INFINITY; + + r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base); + if (r < 0) { + u->cpu_usage_base = 0; + return r; + } + + return 0; +} + +void unit_reset_memory_accounting_last(Unit *u) { + assert(u); + + FOREACH_ARRAY(i, u->memory_accounting_last, ELEMENTSOF(u->memory_accounting_last)) + *i = UINT64_MAX; +} + +int unit_reset_ip_accounting(Unit *u) { + int r = 0; + + assert(u); + + if (u->ip_accounting_ingress_map_fd >= 0) + RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd)); + + if (u->ip_accounting_egress_map_fd >= 0) + RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd)); + + zero(u->ip_accounting_extra); + + return r; +} + +void unit_reset_io_accounting_last(Unit *u) { + assert(u); + + FOREACH_ARRAY(i, u->io_accounting_last, _CGROUP_IO_ACCOUNTING_METRIC_MAX) + *i = UINT64_MAX; +} + +int unit_reset_io_accounting(Unit *u) { + int r; + + assert(u); + + unit_reset_io_accounting_last(u); + + r = unit_get_io_accounting_raw(u, u->io_accounting_base); + if (r < 0) { + zero(u->io_accounting_base); + return r; + } + + return 0; +} + +int unit_reset_accounting(Unit *u) { + int r = 0; + + assert(u); + + RET_GATHER(r, unit_reset_cpu_accounting(u)); + RET_GATHER(r, unit_reset_io_accounting(u)); + RET_GATHER(r, unit_reset_ip_accounting(u)); + unit_reset_memory_accounting_last(u); + + return r; +} + +void unit_invalidate_cgroup(Unit *u, CGroupMask m) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + + if (m == 0) + return; + + /* always invalidate compat pairs together */ + if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO)) + m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO; + + if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT)) + m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT; + + if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */ + return; + + u->cgroup_invalidated_mask |= m; + unit_add_to_cgroup_realize_queue(u); +} + +void unit_invalidate_cgroup_bpf(Unit *u) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + + if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */ + return; + + u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL; + unit_add_to_cgroup_realize_queue(u); + + /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access + * list of our children includes our own. */ + if (u->type == UNIT_SLICE) { + Unit *member; + + UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF) + unit_invalidate_cgroup_bpf(member); + } +} + +void unit_cgroup_catchup(Unit *u) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + + /* We dropped the inotify watch during reexec/reload, so we need to + * check these as they may have changed. + * Note that (currently) the kernel doesn't actually update cgroup + * file modification times, so we can't just serialize and then check + * the mtime for file(s) we are interested in. */ + (void) unit_check_cgroup_events(u); + unit_add_to_cgroup_oom_queue(u); +} + +bool unit_cgroup_delegate(Unit *u) { + CGroupContext *c; + + assert(u); + + if (!UNIT_VTABLE(u)->can_delegate) + return false; + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + return c->delegate; +} + +void manager_invalidate_startup_units(Manager *m) { + Unit *u; + + assert(m); + + SET_FOREACH(u, m->startup_units) + unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET); +} + +int unit_cgroup_freezer_action(Unit *u, FreezerAction action) { + _cleanup_free_ char *path = NULL; + FreezerState target, kernel = _FREEZER_STATE_INVALID; + int r, ret; + + assert(u); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); + + if (!cg_freezer_supported()) + return 0; + + /* Ignore all requests to thaw init.scope or -.slice and reject all requests to freeze them */ + if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE)) + return action == FREEZER_FREEZE ? -EPERM : 0; + + if (!u->cgroup_realized) + return -EBUSY; + + if (action == FREEZER_THAW) { + Unit *slice = UNIT_GET_SLICE(u); + + if (slice) { + r = unit_cgroup_freezer_action(slice, FREEZER_THAW); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to thaw slice %s of unit: %m", slice->id); + } + } + + target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING; + + r = unit_freezer_state_kernel(u, &kernel); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m"); + + if (target == kernel) { + u->freezer_state = target; + if (action == FREEZER_FREEZE) + return 0; + ret = 0; + } else + ret = 1; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path); + if (r < 0) + return r; + + log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing"); + + if (target != kernel) { + if (action == FREEZER_FREEZE) + u->freezer_state = FREEZER_FREEZING; + else + u->freezer_state = FREEZER_THAWING; + } + + r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + + return ret; +} + +int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { + _cleanup_free_ char *v = NULL; + int r; + + assert(u); + assert(cpus); + + if (!u->cgroup_path) + return -ENODATA; + + if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) + return -ENODATA; + + r = cg_get_attribute("cpuset", u->cgroup_path, name, &v); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + + return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL); +} + +static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = { + [CGROUP_DEVICE_POLICY_AUTO] = "auto", + [CGROUP_DEVICE_POLICY_CLOSED] = "closed", + [CGROUP_DEVICE_POLICY_STRICT] = "strict", +}; + +DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy); + +static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = { + [FREEZER_FREEZE] = "freeze", + [FREEZER_THAW] = "thaw", +}; + +DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction); + +static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = { + [CGROUP_PRESSURE_WATCH_OFF] = "off", + [CGROUP_PRESSURE_WATCH_AUTO] = "auto", + [CGROUP_PRESSURE_WATCH_ON] = "on", + [CGROUP_PRESSURE_WATCH_SKIP] = "skip", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_ON); + +static const char* const cgroup_ip_accounting_metric_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "IPIngressBytes", + [CGROUP_IP_EGRESS_BYTES] = "IPEgressBytes", + [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets", + [CGROUP_IP_EGRESS_PACKETS] = "IPEgressPackets", +}; + +DEFINE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric); + +static const char* const cgroup_io_accounting_metric_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "IOReadBytes", + [CGROUP_IO_WRITE_BYTES] = "IOWriteBytes", + [CGROUP_IO_READ_OPERATIONS] = "IOReadOperations", + [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations", +}; + +DEFINE_STRING_TABLE_LOOKUP(cgroup_io_accounting_metric, CGroupIOAccountingMetric); + +static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = { + [CGROUP_MEMORY_PEAK] = "MemoryPeak", + [CGROUP_MEMORY_SWAP_CURRENT] = "MemorySwapCurrent", + [CGROUP_MEMORY_SWAP_PEAK] = "MemorySwapPeak", + [CGROUP_MEMORY_ZSWAP_CURRENT] = "MemoryZSwapCurrent", +}; + +DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric); diff --git a/src/core/cgroup.h b/src/core/cgroup.h new file mode 100644 index 0000000..f1b674b --- /dev/null +++ b/src/core/cgroup.h @@ -0,0 +1,429 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "bpf-lsm.h" +#include "cgroup-util.h" +#include "cpu-set-util.h" +#include "firewall-util.h" +#include "list.h" +#include "pidref.h" +#include "time-util.h" + +typedef struct CGroupTasksMax { + /* If scale == 0, just use value; otherwise, value / scale. + * See tasks_max_resolve(). */ + uint64_t value; + uint64_t scale; +} CGroupTasksMax; + +#define CGROUP_TASKS_MAX_UNSET ((CGroupTasksMax) { .value = UINT64_MAX, .scale = 0 }) + +static inline bool cgroup_tasks_max_isset(const CGroupTasksMax *tasks_max) { + return tasks_max->value != UINT64_MAX || tasks_max->scale != 0; +} + +uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max); + +typedef struct CGroupContext CGroupContext; +typedef struct CGroupDeviceAllow CGroupDeviceAllow; +typedef struct CGroupIODeviceWeight CGroupIODeviceWeight; +typedef struct CGroupIODeviceLimit CGroupIODeviceLimit; +typedef struct CGroupIODeviceLatency CGroupIODeviceLatency; +typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight; +typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth; +typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram; +typedef struct CGroupSocketBindItem CGroupSocketBindItem; + +typedef enum CGroupDevicePolicy { + /* When devices listed, will allow those, plus built-in ones, if none are listed will allow + * everything. */ + CGROUP_DEVICE_POLICY_AUTO, + + /* Everything forbidden, except built-in ones and listed ones. */ + CGROUP_DEVICE_POLICY_CLOSED, + + /* Everything forbidden, except for the listed devices */ + CGROUP_DEVICE_POLICY_STRICT, + + _CGROUP_DEVICE_POLICY_MAX, + _CGROUP_DEVICE_POLICY_INVALID = -EINVAL, +} CGroupDevicePolicy; + +typedef enum FreezerAction { + FREEZER_FREEZE, + FREEZER_THAW, + + _FREEZER_ACTION_MAX, + _FREEZER_ACTION_INVALID = -EINVAL, +} FreezerAction; + +typedef enum CGroupDevicePermissions { + /* We reuse the same bit meanings the kernel's BPF_DEVCG_ACC_xyz definitions use */ + CGROUP_DEVICE_MKNOD = 1 << 0, + CGROUP_DEVICE_READ = 1 << 1, + CGROUP_DEVICE_WRITE = 1 << 2, + _CGROUP_DEVICE_PERMISSIONS_MAX = 1 << 3, + _CGROUP_DEVICE_PERMISSIONS_ALL = _CGROUP_DEVICE_PERMISSIONS_MAX - 1, + _CGROUP_DEVICE_PERMISSIONS_INVALID = -EINVAL, +} CGroupDevicePermissions; + +struct CGroupDeviceAllow { + LIST_FIELDS(CGroupDeviceAllow, device_allow); + char *path; + CGroupDevicePermissions permissions; +}; + +struct CGroupIODeviceWeight { + LIST_FIELDS(CGroupIODeviceWeight, device_weights); + char *path; + uint64_t weight; +}; + +struct CGroupIODeviceLimit { + LIST_FIELDS(CGroupIODeviceLimit, device_limits); + char *path; + uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX]; +}; + +struct CGroupIODeviceLatency { + LIST_FIELDS(CGroupIODeviceLatency, device_latencies); + char *path; + usec_t target_usec; +}; + +struct CGroupBlockIODeviceWeight { + LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights); + char *path; + uint64_t weight; +}; + +struct CGroupBlockIODeviceBandwidth { + LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths); + char *path; + uint64_t rbps; + uint64_t wbps; +}; + +struct CGroupBPFForeignProgram { + LIST_FIELDS(CGroupBPFForeignProgram, programs); + uint32_t attach_type; + char *bpffs_path; +}; + +struct CGroupSocketBindItem { + LIST_FIELDS(CGroupSocketBindItem, socket_bind_items); + int address_family; + int ip_protocol; + uint16_t nr_ports; + uint16_t port_min; +}; + +typedef enum CGroupPressureWatch { + CGROUP_PRESSURE_WATCH_OFF, /* → tells the service payload explicitly not to watch for memory pressure */ + CGROUP_PRESSURE_WATCH_AUTO, /* → on if memory account is on anyway for the unit, otherwise off */ + CGROUP_PRESSURE_WATCH_ON, + CGROUP_PRESSURE_WATCH_SKIP, /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */ + _CGROUP_PRESSURE_WATCH_MAX, + _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL, +} CGroupPressureWatch; + +struct CGroupContext { + bool cpu_accounting; + bool io_accounting; + bool blockio_accounting; + bool memory_accounting; + bool tasks_accounting; + bool ip_accounting; + + /* Configures the memory.oom.group attribute (on unified) */ + bool memory_oom_group; + + bool delegate; + CGroupMask delegate_controllers; + CGroupMask disable_controllers; + char *delegate_subgroup; + + /* For unified hierarchy */ + uint64_t cpu_weight; + uint64_t startup_cpu_weight; + usec_t cpu_quota_per_sec_usec; + usec_t cpu_quota_period_usec; + + CPUSet cpuset_cpus; + CPUSet startup_cpuset_cpus; + CPUSet cpuset_mems; + CPUSet startup_cpuset_mems; + + uint64_t io_weight; + uint64_t startup_io_weight; + LIST_HEAD(CGroupIODeviceWeight, io_device_weights); + LIST_HEAD(CGroupIODeviceLimit, io_device_limits); + LIST_HEAD(CGroupIODeviceLatency, io_device_latencies); + + uint64_t default_memory_min; + uint64_t default_memory_low; + uint64_t default_startup_memory_low; + uint64_t memory_min; + uint64_t memory_low; + uint64_t startup_memory_low; + uint64_t memory_high; + uint64_t startup_memory_high; + uint64_t memory_max; + uint64_t startup_memory_max; + uint64_t memory_swap_max; + uint64_t startup_memory_swap_max; + uint64_t memory_zswap_max; + uint64_t startup_memory_zswap_max; + + bool default_memory_min_set:1; + bool default_memory_low_set:1; + bool default_startup_memory_low_set:1; + bool memory_min_set:1; + bool memory_low_set:1; + bool startup_memory_low_set:1; + bool startup_memory_high_set:1; + bool startup_memory_max_set:1; + bool startup_memory_swap_max_set:1; + bool startup_memory_zswap_max_set:1; + + Set *ip_address_allow; + Set *ip_address_deny; + /* These two flags indicate that redundant entries have been removed from + * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */ + bool ip_address_allow_reduced; + bool ip_address_deny_reduced; + + char **ip_filters_ingress; + char **ip_filters_egress; + LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs); + + Set *restrict_network_interfaces; + bool restrict_network_interfaces_is_allow_list; + + /* For legacy hierarchies */ + uint64_t cpu_shares; + uint64_t startup_cpu_shares; + + uint64_t blockio_weight; + uint64_t startup_blockio_weight; + LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights); + LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths); + + uint64_t memory_limit; + + CGroupDevicePolicy device_policy; + LIST_HEAD(CGroupDeviceAllow, device_allow); + + LIST_HEAD(CGroupSocketBindItem, socket_bind_allow); + LIST_HEAD(CGroupSocketBindItem, socket_bind_deny); + + /* Common */ + CGroupTasksMax tasks_max; + + /* Settings for systemd-oomd */ + ManagedOOMMode moom_swap; + ManagedOOMMode moom_mem_pressure; + uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */ + ManagedOOMPreference moom_preference; + + /* Memory pressure logic */ + CGroupPressureWatch memory_pressure_watch; + usec_t memory_pressure_threshold_usec; + /* NB: For now we don't make the period configurable, not the type, nor do we allow multiple + * triggers, nor triggers for non-memory pressure. We might add that later. */ + + NFTSetContext nft_set_context; + + /* Forward coredumps for processes that crash within this cgroup. + * Requires 'delegate' to also be true. */ + bool coredump_receive; +}; + +/* Used when querying IP accounting data */ +typedef enum CGroupIPAccountingMetric { + CGROUP_IP_INGRESS_BYTES, + CGROUP_IP_INGRESS_PACKETS, + CGROUP_IP_EGRESS_BYTES, + CGROUP_IP_EGRESS_PACKETS, + _CGROUP_IP_ACCOUNTING_METRIC_MAX, + _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL, +} CGroupIPAccountingMetric; + +/* Used when querying IO accounting data */ +typedef enum CGroupIOAccountingMetric { + CGROUP_IO_READ_BYTES, + CGROUP_IO_WRITE_BYTES, + CGROUP_IO_READ_OPERATIONS, + CGROUP_IO_WRITE_OPERATIONS, + _CGROUP_IO_ACCOUNTING_METRIC_MAX, + _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL, +} CGroupIOAccountingMetric; + +typedef enum CGroupMemoryAccountingMetric { + CGROUP_MEMORY_PEAK, + CGROUP_MEMORY_SWAP_PEAK, + /* We cache the above attributes, so that they can be fetched even after the cgroup is gone, e.g. + * when systemd-run exits. */ + _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST = CGROUP_MEMORY_SWAP_PEAK, + + /* These attributes are transient, so no need for caching. */ + CGROUP_MEMORY_SWAP_CURRENT, + CGROUP_MEMORY_ZSWAP_CURRENT, + + _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX, + _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL, +} CGroupMemoryAccountingMetric; + +typedef struct Unit Unit; +typedef struct Manager Manager; +typedef enum ManagerState ManagerState; + +uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state); + +usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period); + +void cgroup_context_init(CGroupContext *c); +void cgroup_context_done(CGroupContext *c); +void cgroup_context_dump(Unit *u, FILE* f, const char *prefix); +void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f); +void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f); + +void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a); +void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w); +void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l); +void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l); +void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w); +void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b); +void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p); +void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head); + +static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) { + assert(c); + + return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON || + (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO && c->memory_accounting); +} + +int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p); +int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p); +int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path); + +void unit_modify_nft_set(Unit *u, bool add); + +CGroupMask unit_get_own_mask(Unit *u); +CGroupMask unit_get_delegate_mask(Unit *u); +CGroupMask unit_get_members_mask(Unit *u); +CGroupMask unit_get_siblings_mask(Unit *u); +CGroupMask unit_get_ancestor_disable_mask(Unit *u); + +CGroupMask unit_get_target_mask(Unit *u); +CGroupMask unit_get_enable_mask(Unit *u); + +void unit_invalidate_cgroup_members_masks(Unit *u); + +void unit_add_family_to_cgroup_realize_queue(Unit *u); + +const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask); +int unit_default_cgroup_path(const Unit *u, char **ret); +int unit_set_cgroup_path(Unit *u, const char *path); +int unit_pick_cgroup_path(Unit *u); + +int unit_realize_cgroup(Unit *u); +void unit_prune_cgroup(Unit *u); +int unit_watch_cgroup(Unit *u); +int unit_watch_cgroup_memory(Unit *u); +void unit_add_to_cgroup_realize_queue(Unit *u); + +void unit_release_cgroup(Unit *u); +/* Releases the cgroup only if it is recursively empty. + * Returns true if the cgroup was released, false otherwise. */ +bool unit_maybe_release_cgroup(Unit *u); + +void unit_add_to_cgroup_empty_queue(Unit *u); +int unit_check_oomd_kill(Unit *u); +int unit_check_oom(Unit *u); + +int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path); + +int manager_setup_cgroup(Manager *m); +void manager_shutdown_cgroup(Manager *m, bool delete); + +unsigned manager_dispatch_cgroup_realize_queue(Manager *m); + +Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup); +Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid); +Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid); +Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid); +Unit* manager_get_unit_by_pid(Manager *m, pid_t pid); + +uint64_t unit_get_ancestor_memory_min(Unit *u); +uint64_t unit_get_ancestor_memory_low(Unit *u); +uint64_t unit_get_ancestor_startup_memory_low(Unit *u); + +int unit_search_main_pid(Unit *u, PidRef *ret); +int unit_watch_all_pids(Unit *u); + +int unit_synthesize_cgroup_empty_event(Unit *u); + +int unit_get_memory_available(Unit *u, uint64_t *ret); +int unit_get_memory_current(Unit *u, uint64_t *ret); +int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret); +int unit_get_tasks_current(Unit *u, uint64_t *ret); +int unit_get_cpu_usage(Unit *u, nsec_t *ret); +int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret); +int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret); + +int unit_reset_cpu_accounting(Unit *u); +void unit_reset_memory_accounting_last(Unit *u); +int unit_reset_ip_accounting(Unit *u); +void unit_reset_io_accounting_last(Unit *u); +int unit_reset_io_accounting(Unit *u); +int unit_reset_accounting(Unit *u); + +#define UNIT_CGROUP_BOOL(u, name) \ + ({ \ + CGroupContext *cc = unit_get_cgroup_context(u); \ + cc ? cc->name : false; \ + }) + +bool manager_owns_host_root_cgroup(Manager *m); +bool unit_has_host_root_cgroup(Unit *u); + +bool unit_has_startup_cgroup_constraints(Unit *u); + +int manager_notify_cgroup_empty(Manager *m, const char *group); + +void unit_invalidate_cgroup(Unit *u, CGroupMask m); +void unit_invalidate_cgroup_bpf(Unit *u); + +void manager_invalidate_startup_units(Manager *m); + +const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_; +CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_; + +void unit_cgroup_catchup(Unit *u); + +bool unit_cgroup_delegate(Unit *u); + +int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name); +int unit_cgroup_freezer_action(Unit *u, FreezerAction action); + +const char* freezer_action_to_string(FreezerAction a) _const_; +FreezerAction freezer_action_from_string(const char *s) _pure_; + +const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_; +CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_; + +const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) _const_; +CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) _pure_; + +const char* cgroup_ip_accounting_metric_to_string(CGroupIPAccountingMetric m) _const_; +CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) _pure_; + +const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_; +CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_; + +const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_; +CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_; diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c new file mode 100644 index 0000000..cd91381 --- /dev/null +++ b/src/core/core-varlink.c @@ -0,0 +1,652 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "core-varlink.h" +#include "mkdir-label.h" +#include "strv.h" +#include "user-util.h" +#include "varlink.h" +#include "varlink-io.systemd.UserDatabase.h" +#include "varlink-io.systemd.ManagedOOM.h" + +typedef struct LookupParameters { + const char *user_name; + const char *group_name; + union { + uid_t uid; + gid_t gid; + }; + const char *service; +} LookupParameters; + +static const char* const managed_oom_mode_properties[] = { + "ManagedOOMSwap", + "ManagedOOMMemoryPressure", +}; + +static int build_user_json(const char *user_name, uid_t uid, JsonVariant **ret) { + assert(user_name); + assert(uid_is_valid(uid)); + assert(ret); + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(user_name)), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(uid)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(uid)), + JSON_BUILD_PAIR("realName", JSON_BUILD_CONST_STRING("Dynamic User")), + JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/")), + JSON_BUILD_PAIR("shell", JSON_BUILD_CONST_STRING(NOLOGIN)), + JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)), + JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic")))))); +} + +static bool user_match_lookup_parameters(LookupParameters *p, const char *name, uid_t uid) { + assert(p); + + if (p->user_name && !streq(name, p->user_name)) + return false; + + if (uid_is_valid(p->uid) && uid != p->uid) + return false; + + return true; +} + +static int build_managed_oom_json_array_element(Unit *u, const char *property, JsonVariant **ret_v) { + bool use_limit = false; + CGroupContext *c; + const char *mode; + + assert(u); + assert(property); + assert(ret_v); + + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return -EOPNOTSUPP; + + c = unit_get_cgroup_context(u); + if (!c) + return -EINVAL; + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + /* systemd-oomd should always treat inactive units as though they didn't enable any action since they + * should not have a valid cgroup */ + mode = managed_oom_mode_to_string(MANAGED_OOM_AUTO); + else if (streq(property, "ManagedOOMSwap")) + mode = managed_oom_mode_to_string(c->moom_swap); + else if (streq(property, "ManagedOOMMemoryPressure")) { + mode = managed_oom_mode_to_string(c->moom_mem_pressure); + use_limit = true; + } else + return -EINVAL; + + return json_build(ret_v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)), + JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)), + JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)), + JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)))); +} + +int manager_varlink_send_managed_oom_update(Unit *u) { + _cleanup_(json_variant_unrefp) JsonVariant *arr = NULL, *v = NULL; + CGroupContext *c; + int r; + + assert(u); + + if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || !u->cgroup_path) + return 0; + + if (MANAGER_IS_SYSTEM(u->manager)) { + /* In system mode we can't send any notifications unless oomd connected back to us. In this + * mode oomd must initiate communication, not us. */ + if (!u->manager->managed_oom_varlink) + return 0; + } else { + /* If we are in user mode, let's connect to oomd if we aren't connected yet. In this mode we + * must initiate communication to oomd, not the other way round. */ + r = manager_varlink_init(u->manager); + if (r <= 0) + return r; + } + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY); + if (r < 0) + return r; + + for (size_t i = 0; i < ELEMENTSOF(managed_oom_mode_properties); i++) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[i], &e); + if (r < 0) + return r; + + r = json_variant_append_array(&arr, e); + if (r < 0) + return r; + } + + r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr)))); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(u->manager)) + /* in system mode, oomd is our client, thus send out notifications as replies to the + * initiating method call from them. */ + r = varlink_notify(u->manager->managed_oom_varlink, v); + else + /* in user mode, we are oomd's client, thus send out notifications as method calls that do + * not expect a reply. */ + r = varlink_send(u->manager->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", v); + + return r; +} + +static int build_managed_oom_cgroups_json(Manager *m, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *arr = NULL; + int r; + + assert(m); + assert(ret); + + r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY); + if (r < 0) + return r; + + for (UnitType t = 0; t < _UNIT_TYPE_MAX; t++) { + + if (!unit_vtable[t]->can_set_managed_oom) + continue; + + LIST_FOREACH(units_by_type, u, m->units_by_type[t]) { + CGroupContext *c; + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + continue; + + c = unit_get_cgroup_context(u); + if (!c) + continue; + + for (size_t j = 0; j < ELEMENTSOF(managed_oom_mode_properties); j++) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + /* For the initial varlink call we only care about units that enabled (i.e. mode is not + * set to "auto") oomd properties. */ + if (!(streq(managed_oom_mode_properties[j], "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) && + !(streq(managed_oom_mode_properties[j], "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL)) + continue; + + r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[j], &e); + if (r < 0) + return r; + + r = json_variant_append_array(&arr, e); + if (r < 0) + return r; + } + } + } + + r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr)))); + if (r < 0) + return r; + + *ret = TAKE_PTR(v); + return 0; +} + +static int vl_method_subscribe_managed_oom_cgroups( + Varlink *link, + JsonVariant *parameters, + VarlinkMethodFlags flags, + void *userdata) { + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + Manager *m = ASSERT_PTR(userdata); + pid_t pid; + Unit *u; + int r; + + assert(link); + + r = varlink_get_peer_pid(link, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + + /* This is meant to be a deterrent and not actual security. The alternative is to check for the systemd-oom + * user that this unit runs as, but NSS lookups are blocking and not allowed from PID 1. */ + if (!streq(u->id, "systemd-oomd.service")) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + /* We only take one subscriber for this method so return an error if there's already an existing one. + * This shouldn't happen since systemd-oomd is the only client of this method. */ + if (FLAGS_SET(flags, VARLINK_METHOD_MORE) && m->managed_oom_varlink) + return varlink_error(link, "io.systemd.ManagedOOM.SubscriptionTaken", NULL); + + r = build_managed_oom_cgroups_json(m, &v); + if (r < 0) + return r; + + if (!FLAGS_SET(flags, VARLINK_METHOD_MORE)) + return varlink_reply(link, v); + + assert(!m->managed_oom_varlink); + m->managed_oom_varlink = varlink_ref(link); + return varlink_notify(m->managed_oom_varlink, v); +} + +static int manager_varlink_send_managed_oom_initial(Manager *m) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + int r; + + assert(m); + + if (MANAGER_IS_SYSTEM(m)) + return 0; + + assert(m->managed_oom_varlink); + + r = build_managed_oom_cgroups_json(m, &v); + if (r < 0) + return r; + + return varlink_send(m->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", v); +} + +static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, uid), 0 }, + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = { + .uid = UID_INVALID, + }; + _cleanup_free_ char *found_name = NULL; + uid_t found_uid = UID_INVALID, uid; + Manager *m = ASSERT_PTR(userdata); + const char *un; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.DynamicUser")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (uid_is_valid(p.uid)) + r = dynamic_user_lookup_uid(m, p.uid, &found_name); + else if (p.user_name) + r = dynamic_user_lookup_name(m, p.user_name, &found_uid); + else { + DynamicUser *d; + + HASHMAP_FOREACH(d, m->dynamic_users) { + r = dynamic_user_current(d, &uid); + if (r == -EAGAIN) /* not realized yet? */ + continue; + if (r < 0) + return r; + + if (!user_match_lookup_parameters(&p, d->name, uid)) + continue; + + if (v) { + r = varlink_notify(link, v); + if (r < 0) + return r; + + v = json_variant_unref(v); + } + + r = build_user_json(d->name, uid, &v); + if (r < 0) + return r; + } + + if (!v) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + + return varlink_reply(link, v); + } + if (r == -ESRCH) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + if (r < 0) + return r; + + uid = uid_is_valid(found_uid) ? found_uid : p.uid; + un = found_name ?: p.user_name; + + if (!user_match_lookup_parameters(&p, un, uid)) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + r = build_user_json(un, uid, &v); + if (r < 0) + return r; + + return varlink_reply(link, v); +} + +static int build_group_json(const char *group_name, gid_t gid, JsonVariant **ret) { + assert(group_name); + assert(gid_is_valid(gid)); + assert(ret); + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(group_name)), + JSON_BUILD_PAIR("description", JSON_BUILD_CONST_STRING("Dynamic Group")), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid)), + JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic")))))); + } + +static bool group_match_lookup_parameters(LookupParameters *p, const char *name, gid_t gid) { + assert(p); + + if (p->group_name && !streq(name, p->group_name)) + return false; + + if (gid_is_valid(p->gid) && gid != p->gid) + return false; + + return true; +} + +static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, gid), 0 }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = { + .gid = GID_INVALID, + }; + _cleanup_free_ char *found_name = NULL; + uid_t found_gid = GID_INVALID, gid; + Manager *m = ASSERT_PTR(userdata); + const char *gn; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.DynamicUser")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (gid_is_valid(p.gid)) + r = dynamic_user_lookup_uid(m, (uid_t) p.gid, &found_name); + else if (p.group_name) + r = dynamic_user_lookup_name(m, p.group_name, (uid_t*) &found_gid); + else { + DynamicUser *d; + + HASHMAP_FOREACH(d, m->dynamic_users) { + uid_t uid; + + r = dynamic_user_current(d, &uid); + if (r == -EAGAIN) + continue; + if (r < 0) + return r; + + if (!group_match_lookup_parameters(&p, d->name, (gid_t) uid)) + continue; + + if (v) { + r = varlink_notify(link, v); + if (r < 0) + return r; + + v = json_variant_unref(v); + } + + r = build_group_json(d->name, (gid_t) uid, &v); + if (r < 0) + return r; + } + + if (!v) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + + return varlink_reply(link, v); + } + if (r == -ESRCH) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + if (r < 0) + return r; + + gid = gid_is_valid(found_gid) ? found_gid : p.gid; + gn = found_name ?: p.group_name; + + if (!group_match_lookup_parameters(&p, gn, gid)) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + r = build_group_json(gn, gid, &v); + if (r < 0) + return r; + + return varlink_reply(link, v); +} + +static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + LookupParameters p = {}; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.DynamicUser")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + /* We don't support auxiliary groups with dynamic users. */ + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); +} + +static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + assert(link); + + if (link == m->managed_oom_varlink) + m->managed_oom_varlink = varlink_unref(link); +} + +static int manager_varlink_init_system(Manager *m) { + _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; + int r; + + assert(m); + + if (m->varlink_server) + return 1; + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + r = manager_setup_varlink_server(m, &s); + if (r < 0) + return log_error_errno(r, "Failed to set up varlink server: %m"); + + if (!MANAGER_IS_TEST_RUN(m)) { + (void) mkdir_p_label("/run/systemd/userdb", 0755); + + FOREACH_STRING(address, "/run/systemd/userdb/io.systemd.DynamicUser", VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM) { + if (MANAGER_IS_RELOADING(m)) { + /* If manager is reloading, we skip listening on existing addresses, since + * the fd should be acquired later through deserialization. */ + if (access(address, F_OK) >= 0) + continue; + if (errno != ENOENT) + return log_error_errno(errno, + "Failed to check if varlink socket '%s' exists: %m", address); + } + + r = varlink_server_listen_address(s, address, 0666); + if (r < 0) + return log_error_errno(r, "Failed to bind to varlink socket '%s': %m", address); + } + } + + r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + m->varlink_server = TAKE_PTR(s); + return 1; +} + +static int vl_reply(Varlink *link, JsonVariant *parameters, const char *error_id, VarlinkReplyFlags flags, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + int r; + + if (error_id) + log_debug("varlink systemd-oomd client error: %s", error_id); + + if (FLAGS_SET(flags, VARLINK_REPLY_ERROR) && FLAGS_SET(flags, VARLINK_REPLY_LOCAL)) { + /* Varlink connection was closed, likely because of systemd-oomd restart. Let's try to + * reconnect and send the initial ManagedOOM update again. */ + + m->managed_oom_varlink = varlink_unref(link); + + log_debug("Reconnecting to %s", VARLINK_ADDR_PATH_MANAGED_OOM_USER); + + r = manager_varlink_init(m); + if (r <= 0) + return r; + } + + return 0; +} + +static int manager_varlink_init_user(Manager *m) { + _cleanup_(varlink_close_unrefp) Varlink *link = NULL; + int r; + + assert(m); + + if (m->managed_oom_varlink) + return 1; + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_USER); + if (r < 0) { + if (r == -ENOENT || ERRNO_IS_DISCONNECT(r)) { + log_debug("systemd-oomd varlink unix socket not found, skipping user manager varlink setup"); + return 0; + } + return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM_USER); + } + + varlink_set_userdata(link, m); + + r = varlink_bind_reply(link, vl_reply); + if (r < 0) + return r; + + r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + m->managed_oom_varlink = TAKE_PTR(link); + + /* Queue the initial ManagedOOM update. */ + (void) manager_varlink_send_managed_oom_initial(m); + + return 1; +} + +int manager_setup_varlink_server(Manager *m, VarlinkServer **ret) { + _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; + int r; + + assert(m); + assert(ret); + + r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA); + if (r < 0) + return log_debug_errno(r, "Failed to allocate varlink server object: %m"); + + varlink_server_set_userdata(s, m); + + r = varlink_server_add_interface_many( + s, + &vl_interface_io_systemd_UserDatabase, + &vl_interface_io_systemd_ManagedOOM); + if (r < 0) + return log_error_errno(r, "Failed to add interfaces to varlink server: %m"); + + r = varlink_server_bind_method_many( + s, + "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record, + "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record, + "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships, + "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups); + if (r < 0) + return log_debug_errno(r, "Failed to register varlink methods: %m"); + + r = varlink_server_bind_disconnect(s, vl_disconnect); + if (r < 0) + return log_debug_errno(r, "Failed to register varlink disconnect handler: %m"); + + *ret = TAKE_PTR(s); + return 0; +} + +int manager_varlink_init(Manager *m) { + return MANAGER_IS_SYSTEM(m) ? manager_varlink_init_system(m) : manager_varlink_init_user(m); +} + +void manager_varlink_done(Manager *m) { + assert(m); + + /* Explicitly close the varlink connection to oomd. Note we first take the varlink connection out of + * the manager, and only then disconnect it — in two steps – so that we don't end up accidentally + * unreffing it twice. After all, closing the connection might cause the disconnect handler we + * installed (vl_disconnect() above) to be called, where we will unref it too. */ + varlink_close_unref(TAKE_PTR(m->managed_oom_varlink)); + + m->varlink_server = varlink_server_unref(m->varlink_server); + m->managed_oom_varlink = varlink_close_unref(m->managed_oom_varlink); +} diff --git a/src/core/core-varlink.h b/src/core/core-varlink.h new file mode 100644 index 0000000..7f810d1 --- /dev/null +++ b/src/core/core-varlink.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "manager.h" + +int manager_varlink_init(Manager *m); +void manager_varlink_done(Manager *m); + +/* Creates a new VarlinkServer and binds methods. Does not set up sockets or attach events. + * Used for manager serialize/deserialize. */ +int manager_setup_varlink_server(Manager *m, VarlinkServer **ret_s); + +/* The manager is expected to send an update to systemd-oomd if one of the following occurs: + * - The value of ManagedOOM*= properties change + * - A unit with ManagedOOM*= properties changes unit active state */ +int manager_varlink_send_managed_oom_update(Unit *u); diff --git a/src/core/crash-handler.c b/src/core/crash-handler.c new file mode 100644 index 0000000..f5c31b6 --- /dev/null +++ b/src/core/crash-handler.c @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-messages.h" + +#include "crash-handler.h" +#include "exit-status.h" +#include "macro.h" +#include "main.h" +#include "missing_syscall.h" +#include "process-util.h" +#include "raw-clone.h" +#include "rlimit-util.h" +#include "signal-util.h" +#include "terminal-util.h" +#include "virt.h" + +_noreturn_ void freeze_or_exit_or_reboot(void) { + + /* If we are running in a container, let's prefer exiting, after all we can propagate an exit code to + * the container manager, and thus inform it that something went wrong. */ + if (detect_container() > 0) { + log_struct(LOG_EMERG, + LOG_MESSAGE("Exiting PID 1..."), + "MESSAGE_ID=" SD_MESSAGE_CRASH_EXIT_STR); + _exit(EXIT_EXCEPTION); + } + + if (arg_crash_reboot) { + log_notice("Rebooting in 10s..."); + (void) sleep(10); + + log_notice("Rebooting now..."); + (void) reboot(RB_AUTOBOOT); + log_struct_errno(LOG_EMERG, errno, + LOG_MESSAGE("Failed to reboot: %m"), + "MESSAGE_ID=" SD_MESSAGE_CRASH_FAILED_STR); + } + + log_struct(LOG_EMERG, + LOG_MESSAGE("Freezing execution."), + "MESSAGE_ID=" SD_MESSAGE_CRASH_FREEZE_STR); + sync(); + freeze(); +} + +_noreturn_ static void crash(int sig, siginfo_t *siginfo, void *context) { + struct sigaction sa; + pid_t pid; + + /* NB: 💣 💣 💣 This is a signal handler, most likely executed in a situation where we have corrupted + * memory. Thus: please avoid any libc memory allocation here, or any functions that internally use + * memory allocation, as we cannot rely on memory allocation still working at this point! (Note that + * memory allocation is not async-signal-safe anyway — see signal-safety(7) for details —, and thus + * is not permissible in signal handlers.) */ + + if (getpid_cached() != 1) + /* Pass this on immediately, if this is not PID 1 */ + propagate_signal(sig, siginfo); + else if (!arg_dump_core) + log_struct(LOG_EMERG, + LOG_MESSAGE("Caught <%s>, not dumping core.", signal_to_string(sig)), + "MESSAGE_ID=" SD_MESSAGE_CRASH_NO_COREDUMP_STR); + else { + sa = (struct sigaction) { + .sa_handler = nop_signal_handler, + .sa_flags = SA_NOCLDSTOP|SA_RESTART, + }; + + /* We want to wait for the core process, hence let's enable SIGCHLD */ + (void) sigaction(SIGCHLD, &sa, NULL); + + pid = raw_clone(SIGCHLD); + if (pid < 0) + log_struct_errno(LOG_EMERG, errno, + LOG_MESSAGE("Caught <%s>, cannot fork for core dump: %m", signal_to_string(sig)), + "MESSAGE_ID=" SD_MESSAGE_CRASH_NO_FORK_STR); + else if (pid == 0) { + /* Enable default signal handler for core dump */ + + sa = (struct sigaction) { + .sa_handler = SIG_DFL, + }; + (void) sigaction(sig, &sa, NULL); + + /* Don't limit the coredump size */ + (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)); + + /* Just to be sure... */ + (void) chdir("/"); + + /* Raise the signal again */ + propagate_signal(sig, siginfo); + assert_not_reached(); + _exit(EXIT_EXCEPTION); + } else { + siginfo_t status; + int r; + + if (siginfo) { + if (siginfo->si_pid == 0) + log_struct(LOG_EMERG, + LOG_MESSAGE("Caught <%s>, from unknown sender process.", signal_to_string(sig)), + "MESSAGE_ID=" SD_MESSAGE_CRASH_UNKNOWN_SIGNAL_STR); + else if (siginfo->si_pid == 1) + log_struct(LOG_EMERG, + LOG_MESSAGE("Caught <%s>, from our own process.", signal_to_string(sig)), + "MESSAGE_ID=" SD_MESSAGE_CRASH_SYSTEMD_SIGNAL_STR); + else + log_struct(LOG_EMERG, + LOG_MESSAGE("Caught <%s> from PID "PID_FMT".", signal_to_string(sig), siginfo->si_pid), + "MESSAGE_ID=" SD_MESSAGE_CRASH_PROCESS_SIGNAL_STR); + } + + /* Order things nicely. */ + r = wait_for_terminate(pid, &status); + if (r < 0) + log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Caught <%s>, waitpid() failed: %m", signal_to_string(sig)), + "MESSAGE_ID=" SD_MESSAGE_CRASH_WAITPID_FAILED_STR); + else if (status.si_code != CLD_DUMPED) { + const char *s = status.si_code == CLD_EXITED ? + exit_status_to_string(status.si_status, EXIT_STATUS_LIBC) : + signal_to_string(status.si_status); + + log_struct(LOG_EMERG, + LOG_MESSAGE("Caught <%s>, core dump failed (child "PID_FMT", code=%s, status=%i/%s).", + signal_to_string(sig), + pid, + sigchld_code_to_string(status.si_code), + status.si_status, + strna(s)), + "MESSAGE_ID=" SD_MESSAGE_CRASH_COREDUMP_FAILED_STR); + } else + log_struct(LOG_EMERG, + LOG_MESSAGE("Caught <%s>, dumped core as pid "PID_FMT".", + signal_to_string(sig), pid), + "MESSAGE_ID=" SD_MESSAGE_CRASH_COREDUMP_PID_STR); + } + } + + if (arg_crash_chvt >= 0) + (void) chvt(arg_crash_chvt); + + sa = (struct sigaction) { + .sa_handler = SIG_IGN, + .sa_flags = SA_NOCLDSTOP|SA_NOCLDWAIT|SA_RESTART, + }; + + /* Let the kernel reap children for us */ + (void) sigaction(SIGCHLD, &sa, NULL); + + if (arg_crash_shell) { + log_notice("Executing crash shell in 10s..."); + (void) sleep(10); + + pid = raw_clone(SIGCHLD); + if (pid < 0) + log_struct_errno(LOG_EMERG, errno, + LOG_MESSAGE("Failed to fork off crash shell: %m"), + "MESSAGE_ID=" SD_MESSAGE_CRASH_SHELL_FORK_FAILED_STR); + else if (pid == 0) { + (void) setsid(); + (void) make_console_stdio(); + (void) rlimit_nofile_safe(); + (void) execle("/bin/sh", "/bin/sh", NULL, environ); + + log_struct_errno(LOG_EMERG, errno, + LOG_MESSAGE("execle() failed: %m"), + "MESSAGE_ID=" SD_MESSAGE_CRASH_EXECLE_FAILED_STR); + _exit(EXIT_EXCEPTION); + } else { + log_info("Spawned crash shell as PID "PID_FMT".", pid); + (void) wait_for_terminate(pid, NULL); + } + } + + freeze_or_exit_or_reboot(); +} + +void install_crash_handler(void) { + static const struct sigaction sa = { + .sa_sigaction = crash, + .sa_flags = SA_NODEFER | SA_SIGINFO, /* So that we can raise the signal again from the signal handler */ + }; + int r; + + /* We ignore the return value here, since, we don't mind if we cannot set up a crash handler */ + r = sigaction_many(&sa, SIGNALS_CRASH_HANDLER); + if (r < 0) + log_debug_errno(r, "I had trouble setting up the crash handler, ignoring: %m"); +} diff --git a/src/core/crash-handler.h b/src/core/crash-handler.h new file mode 100644 index 0000000..dc14335 --- /dev/null +++ b/src/core/crash-handler.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +_noreturn_ void freeze_or_exit_or_reboot(void); +void install_crash_handler(void); diff --git a/src/core/dbus-automount.c b/src/core/dbus-automount.c new file mode 100644 index 0000000..881bf50 --- /dev/null +++ b/src/core/dbus-automount.c @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "automount.h" +#include "bus-get-properties.h" +#include "dbus-automount.h" +#include "dbus-util.h" +#include "string-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, automount_result, AutomountResult); + +const sd_bus_vtable bus_automount_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Automount, where), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ExtraOptions", "s", NULL, offsetof(Automount, extra_options), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Automount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Automount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutIdleUSec", "t", bus_property_get_usec, offsetof(Automount, timeout_idle_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static int bus_automount_set_transient_property( + Automount *a, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(a); + + assert(a); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Where")) + return bus_set_transient_path(u, name, &a->where, message, flags, error); + + if (streq(name, "ExtraOptions")) + return bus_set_transient_string(u, name, &a->extra_options, message, flags, error); + + if (streq(name, "TimeoutIdleUSec")) + return bus_set_transient_usec_fix_0(u, name, &a->timeout_idle_usec, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &a->directory_mode, message, flags, error); + + return 0; +} + +int bus_automount_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Automount *a = AUTOMOUNT(u); + + assert(a); + assert(name); + assert(message); + + if (u->transient && u->load_state == UNIT_STUB) /* This is a transient unit? let's load a little more */ + return bus_automount_set_transient_property(a, name, message, flags, error); + + return 0; +} diff --git a/src/core/dbus-automount.h b/src/core/dbus-automount.h new file mode 100644 index 0000000..cfceaec --- /dev/null +++ b/src/core/dbus-automount.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_automount_vtable[]; + +int bus_automount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c new file mode 100644 index 0000000..8a9570f --- /dev/null +++ b/src/core/dbus-cgroup.c @@ -0,0 +1,2287 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "af-list.h" +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bpf-foreign.h" +#include "bus-get-properties.h" +#include "bus-util.h" +#include "cgroup-util.h" +#include "cgroup.h" +#include "core-varlink.h" +#include "dbus-cgroup.h" +#include "dbus-util.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "firewall-util.h" +#include "in-addr-prefix-util.h" +#include "ip-protocol-list.h" +#include "limits-util.h" +#include "memstream-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "percent-util.h" +#include "socket-util.h" + +BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", CGroupTasksMax, cgroup_tasks_max_resolve); +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_cgroup_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch); + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_preference, managed_oom_preference, ManagedOOMPreference); + +static int property_get_cgroup_mask( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupMask *mask = userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (CGroupController ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) { + if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0) + continue; + + r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl)); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_delegate_controllers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + if (!c->delegate) + return sd_bus_message_append(reply, "as", 0); + + return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error); +} + +static int property_get_cpuset( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CPUSet *cpus = ASSERT_PTR(userdata); + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + + (void) cpu_set_to_dbus(cpus, &array, &allocated); + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_io_device_weight( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_weights, w, c->io_device_weights) { + r = sd_bus_message_append(reply, "(st)", w->path, w->weight); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_io_device_limits( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_limits, l, c->io_device_limits) { + CGroupIOLimitType type; + + type = cgroup_io_limit_type_from_string(property); + if (type < 0 || l->limits[type] == cgroup_io_limit_defaults[type]) + continue; + + r = sd_bus_message_append(reply, "(st)", l->path, l->limits[type]); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_io_device_latency( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_latencies, l, c->io_device_latencies) { + r = sd_bus_message_append(reply, "(st)", l->path, l->target_usec); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_blockio_device_weight( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) { + r = sd_bus_message_append(reply, "(st)", w->path, w->weight); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_blockio_device_bandwidths( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + uint64_t v; + + if (streq(property, "BlockIOReadBandwidth")) + v = b->rbps; + else + v = b->wbps; + + if (v == CGROUP_LIMIT_MAX) + continue; + + r = sd_bus_message_append(reply, "(st)", b->path, v); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_device_allow( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(device_allow, a, c->device_allow) { + r = sd_bus_message_append(reply, "(ss)", a->path, cgroup_device_permissions_to_string(a->permissions)); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_ip_address_access( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Set **prefixes = ASSERT_PTR(userdata); + struct in_addr_prefix *i; + int r; + + r = sd_bus_message_open_container(reply, 'a', "(iayu)"); + if (r < 0) + return r; + + SET_FOREACH(i, *prefixes) { + + r = sd_bus_message_open_container(reply, 'r', "iayu"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "i", i->family); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &i->address, FAMILY_ADDRESS_SIZE(i->family)); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "u", (uint32_t) i->prefixlen); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_bpf_foreign_program( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + CGroupContext *c = userdata; + int r; + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(programs, p, c->bpf_foreign_programs) { + const char *attach_type = bpf_cgroup_attach_type_to_string(p->attach_type); + + r = sd_bus_message_append(reply, "(ss)", attach_type, p->bpffs_path); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_socket_bind( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupSocketBindItem **items = ASSERT_PTR(userdata); + int r; + + r = sd_bus_message_open_container(reply, 'a', "(iiqq)"); + if (r < 0) + return r; + + LIST_FOREACH(socket_bind_items, i, *items) { + r = sd_bus_message_append(reply, "(iiqq)", i->address_family, i->ip_protocol, i->nr_ports, i->port_min); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_restrict_network_interfaces( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "b", c->restrict_network_interfaces_is_allow_list); + if (r < 0) + return r; + + r = bus_message_append_string_set(reply, c->restrict_network_interfaces); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int property_get_cgroup_nft_set( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + int r; + CGroupContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(iiss)"); + if (r < 0) + return r; + + FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) { + r = sd_bus_message_append(reply, "(iiss)", nft_set->source, nft_set->nfproto, nft_set->table, nft_set->set); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_cgroup_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0), + SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0), + SD_BUS_PROPERTY("DelegateSubgroup", "s", NULL, offsetof(CGroupContext, delegate_subgroup), 0), + SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0), + SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0), + SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0), + SD_BUS_PROPERTY("CPUShares", "t", NULL, offsetof(CGroupContext, cpu_shares), 0), + SD_BUS_PROPERTY("StartupCPUShares", "t", NULL, offsetof(CGroupContext, startup_cpu_shares), 0), + SD_BUS_PROPERTY("CPUQuotaPerSecUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_per_sec_usec), 0), + SD_BUS_PROPERTY("CPUQuotaPeriodUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_period_usec), 0), + SD_BUS_PROPERTY("AllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_cpus), 0), + SD_BUS_PROPERTY("StartupAllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, startup_cpuset_cpus), 0), + SD_BUS_PROPERTY("AllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_mems), 0), + SD_BUS_PROPERTY("StartupAllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, startup_cpuset_mems), 0), + SD_BUS_PROPERTY("IOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, io_accounting), 0), + SD_BUS_PROPERTY("IOWeight", "t", NULL, offsetof(CGroupContext, io_weight), 0), + SD_BUS_PROPERTY("StartupIOWeight", "t", NULL, offsetof(CGroupContext, startup_io_weight), 0), + SD_BUS_PROPERTY("IODeviceWeight", "a(st)", property_get_io_device_weight, 0, 0), + SD_BUS_PROPERTY("IOReadBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IOWriteBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IOReadIOPSMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IOWriteIOPSMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IODeviceLatencyTargetUSec", "a(st)", property_get_io_device_latency, 0, 0), + SD_BUS_PROPERTY("BlockIOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, blockio_accounting), 0), + SD_BUS_PROPERTY("BlockIOWeight", "t", NULL, offsetof(CGroupContext, blockio_weight), 0), + SD_BUS_PROPERTY("StartupBlockIOWeight", "t", NULL, offsetof(CGroupContext, startup_blockio_weight), 0), + SD_BUS_PROPERTY("BlockIODeviceWeight", "a(st)", property_get_blockio_device_weight, 0, 0), + SD_BUS_PROPERTY("BlockIOReadBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0), + SD_BUS_PROPERTY("BlockIOWriteBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0), + SD_BUS_PROPERTY("MemoryAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, memory_accounting), 0), + SD_BUS_PROPERTY("DefaultMemoryLow", "t", NULL, offsetof(CGroupContext, default_memory_low), 0), + SD_BUS_PROPERTY("DefaultStartupMemoryLow", "t", NULL, offsetof(CGroupContext, default_startup_memory_low), 0), + SD_BUS_PROPERTY("DefaultMemoryMin", "t", NULL, offsetof(CGroupContext, default_memory_min), 0), + SD_BUS_PROPERTY("MemoryMin", "t", NULL, offsetof(CGroupContext, memory_min), 0), + SD_BUS_PROPERTY("MemoryLow", "t", NULL, offsetof(CGroupContext, memory_low), 0), + SD_BUS_PROPERTY("StartupMemoryLow", "t", NULL, offsetof(CGroupContext, startup_memory_low), 0), + SD_BUS_PROPERTY("MemoryHigh", "t", NULL, offsetof(CGroupContext, memory_high), 0), + SD_BUS_PROPERTY("StartupMemoryHigh", "t", NULL, offsetof(CGroupContext, startup_memory_high), 0), + SD_BUS_PROPERTY("MemoryMax", "t", NULL, offsetof(CGroupContext, memory_max), 0), + SD_BUS_PROPERTY("StartupMemoryMax", "t", NULL, offsetof(CGroupContext, startup_memory_max), 0), + SD_BUS_PROPERTY("MemorySwapMax", "t", NULL, offsetof(CGroupContext, memory_swap_max), 0), + SD_BUS_PROPERTY("StartupMemorySwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_swap_max), 0), + SD_BUS_PROPERTY("MemoryZSwapMax", "t", NULL, offsetof(CGroupContext, memory_zswap_max), 0), + SD_BUS_PROPERTY("StartupMemoryZSwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_zswap_max), 0), + SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0), + SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0), + SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0), + SD_BUS_PROPERTY("TasksAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, tasks_accounting), 0), + SD_BUS_PROPERTY("TasksMax", "t", bus_property_get_tasks_max, offsetof(CGroupContext, tasks_max), 0), + SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0), + SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0), + SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0), + SD_BUS_PROPERTY("IPIngressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_ingress), 0), + SD_BUS_PROPERTY("IPEgressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_egress), 0), + SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0), + SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimit", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit), 0), + SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0), + SD_BUS_PROPERTY("BPFProgram", "a(ss)", property_get_bpf_foreign_program, 0, 0), + SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0), + SD_BUS_PROPERTY("SocketBindDeny", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_deny), 0), + SD_BUS_PROPERTY("RestrictNetworkInterfaces", "(bas)", property_get_restrict_network_interfaces, 0, 0), + SD_BUS_PROPERTY("MemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, memory_pressure_watch), 0), + SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, memory_pressure_threshold_usec), 0), + SD_BUS_PROPERTY("NFTSet", "a(iiss)", property_get_cgroup_nft_set, 0, 0), + SD_BUS_PROPERTY("CoredumpReceive", "b", bus_property_get_bool, offsetof(CGroupContext, coredump_receive), 0), + SD_BUS_VTABLE_END +}; + +static int bus_cgroup_set_transient_property( + Unit *u, + CGroupContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int r; + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Delegate")) { + int b; + + if (!UNIT_VTABLE(u)->can_delegate) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type"); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->delegate = b; + c->delegate_controllers = b ? CGROUP_MASK_DELEGATE : 0; + + unit_write_settingf(u, flags, name, "Delegate=%s", yes_no(b)); + } + + return 1; + + } else if (streq(name, "DelegateSubgroup")) { + const char *s; + + if (!UNIT_VTABLE(u)->can_delegate) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type"); + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!isempty(s) && cg_needs_escape(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid control group name: %s", s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (isempty(s)) + c->delegate_subgroup = mfree(c->delegate_subgroup); + else { + r = free_and_strdup_warn(&c->delegate_subgroup, s); + if (r < 0) + return r; + } + + unit_write_settingf(u, flags, name, "DelegateSubgroup=%s", s); + } + + return 1; + + } else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) { + CGroupMask mask = 0; + + if (streq(name, "DelegateControllers") && !UNIT_VTABLE(u)->can_delegate) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type"); + + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + CGroupController cc; + const char *t; + + r = sd_bus_message_read(message, "s", &t); + if (r < 0) + return r; + if (r == 0) + break; + + cc = cgroup_controller_from_string(t); + if (cc < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown cgroup controller '%s'", t); + + mask |= CGROUP_CONTROLLER_TO_MASK(cc); + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *t = NULL; + + r = cg_mask_to_string(mask, &t); + if (r < 0) + return r; + + if (streq(name, "DelegateControllers")) { + + c->delegate = true; + if (mask == 0) + c->delegate_controllers = 0; + else + c->delegate_controllers |= mask; + + unit_write_settingf(u, flags, name, "Delegate=%s", strempty(t)); + + } else if (streq(name, "DisableControllers")) { + + if (mask == 0) + c->disable_controllers = 0; + else + c->disable_controllers |= mask; + + unit_write_settingf(u, flags, name, "%s=%s", name, strempty(t)); + } + } + + return 1; + } else if (STR_IN_SET(name, "IPIngressFilterPath", "IPEgressFilterPath")) { + char ***filters; + size_t n = 0; + + filters = streq(name, "IPIngressFilterPath") ? &c->ip_filters_ingress : &c->ip_filters_egress; + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + const char *path; + + r = sd_bus_message_read(message, "s", &path); + if (r < 0) + return r; + if (r == 0) + break; + + if (!path_is_normalized(path) || !path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects a normalized absolute path.", name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && !strv_contains(*filters, path)) { + r = strv_extend(filters, path); + if (r < 0) + return log_oom(); + } + n++; + } + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + *filters = strv_free(*filters); + + unit_invalidate_cgroup_bpf(u); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fputs(name, f); + fputs("=\n", f); + + STRV_FOREACH(entry, *filters) + fprintf(f, "%s=%s\n", name, *entry); + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + + if (*filters) { + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) { + static bool warned = false; + + log_full(warned ? LOG_DEBUG : LOG_WARNING, + "Transient unit %s configures an IP firewall with BPF, but the local system does not support BPF/cgroup firewalling with multiple filters.\n" + "Starting this unit will fail! (This warning is only shown for the first started transient unit using IP firewalling.)", u->id); + warned = true; + } + } + } + + return 1; + } else if (streq(name, "BPFProgram")) { + const char *a, *p; + size_t n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &a, &p)) > 0) { + int attach_type = bpf_cgroup_attach_type_from_string(a); + if (attach_type < 0) + return sd_bus_error_setf( + error, + SD_BUS_ERROR_INVALID_ARGS, + "%s expects a valid BPF attach type, got '%s'.", + name, a); + + if (!path_is_normalized(p) || !path_is_absolute(p)) + return sd_bus_error_setf( + error, + SD_BUS_ERROR_INVALID_ARGS, + "%s= expects a normalized absolute path.", + name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = cgroup_context_add_bpf_foreign_program(c, attach_type, p); + if (r < 0) + return r; + } + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + while (c->bpf_foreign_programs) + cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fputs(name, f); + fputs("=\n", f); + + LIST_FOREACH(programs, fp, c->bpf_foreign_programs) + fprintf(f, "%s=%s:%s\n", name, + bpf_cgroup_attach_type_to_string(fp->attach_type), + fp->bpffs_path); + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + + if (c->bpf_foreign_programs) { + r = bpf_foreign_supported(); + if (r < 0) + return r; + if (r == 0) + log_full(LOG_DEBUG, + "Transient unit %s configures a BPF program pinned to BPF " + "filesystem, but the local system does not support that.\n" + "Starting this unit will fail!", u->id); + } + } + + return 1; + + } else if (streq(name, "MemoryPressureWatch")) { + CGroupPressureWatch p; + const char *t; + + r = sd_bus_message_read(message, "s", &t); + if (r < 0) + return r; + + if (isempty(t)) + p = _CGROUP_PRESSURE_WATCH_INVALID; + else { + p = cgroup_pressure_watch_from_string(t); + if (p < 0) + return p; + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->memory_pressure_watch = p; + unit_write_settingf(u, flags, name, "MemoryPressureWatch=%s", strempty(cgroup_pressure_watch_to_string(p))); + } + + return 1; + + } else if (streq(name, "MemoryPressureThresholdUSec")) { + uint64_t t; + + r = sd_bus_message_read(message, "t", &t); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->memory_pressure_threshold_usec = t; + + if (t == UINT64_MAX) + unit_write_setting(u, flags, name, "MemoryPressureThresholdUSec="); + else + unit_write_settingf(u, flags, name, "MemoryPressureThresholdUSec=%" PRIu64, t); + } + + return 1; + } else if (streq(name, "CoredumpReceive")) { + int b; + + if (!UNIT_VTABLE(u)->can_delegate) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type"); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->coredump_receive = b; + + unit_write_settingf(u, flags, name, "CoredumpReceive=%s", yes_no(b)); + } + + return 1; + } + + return 0; +} + +static int bus_cgroup_set_boolean( + Unit *u, + const char *name, + bool *p, + CGroupMask mask, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int b, r; + + assert(p); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = b; + unit_invalidate_cgroup(u, mask); + unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(b)); + } + + return 1; +} + +#define BUS_DEFINE_SET_CGROUP_WEIGHT(function, mask, check, val) \ + static int bus_cgroup_set_##function( \ + Unit *u, \ + const char *name, \ + uint64_t *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + uint64_t v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "t", &v); \ + if (r < 0) \ + return r; \ + \ + if (!check(v)) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Value specified in %s is out of range", name); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_invalidate_cgroup(u, mask); \ + \ + if (v == (val)) \ + unit_write_settingf(u, flags, name, \ + "%s=", name); \ + else \ + unit_write_settingf(u, flags, name, \ + "%s=%" PRIu64, name, v); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_CGROUP_LIMIT(function, mask, scale, minimum) \ + static int bus_cgroup_set_##function( \ + Unit *u, \ + const char *name, \ + uint64_t *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + uint64_t v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "t", &v); \ + if (r < 0) \ + return r; \ + \ + if (v < minimum) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Value specified in %s is out of range", name); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_invalidate_cgroup(u, mask); \ + \ + if (v == CGROUP_LIMIT_MAX) \ + unit_write_settingf(u, flags, name, \ + "%s=infinity", name); \ + else \ + unit_write_settingf(u, flags, name, \ + "%s=%" PRIu64, name, v); \ + } \ + \ + return 1; \ + } \ + static int bus_cgroup_set_##function##_scale( \ + Unit *u, \ + const char *name, \ + uint64_t *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + uint64_t v; \ + uint32_t raw; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "u", &raw); \ + if (r < 0) \ + return r; \ + \ + v = scale(raw, UINT32_MAX); \ + if (v < minimum || v >= UINT64_MAX) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Value specified in %s is out of range", name); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_invalidate_cgroup(u, mask); \ + \ + /* Prepare to chop off suffix */ \ + assert_se(endswith(name, "Scale")); \ + \ + int scaled = UINT32_SCALE_TO_PERMYRIAD(raw); \ + unit_write_settingf(u, flags, name, "%.*s=" PERMYRIAD_AS_PERCENT_FORMAT_STR, \ + (int)(strlen(name) - strlen("Scale")), name, \ + PERMYRIAD_AS_PERCENT_FORMAT_VAL(scaled)); \ + } \ + \ + return 1; \ + } + +DISABLE_WARNING_TYPE_LIMITS; +BUS_DEFINE_SET_CGROUP_WEIGHT(cpu_shares, CGROUP_MASK_CPU, CGROUP_CPU_SHARES_IS_OK, CGROUP_CPU_SHARES_INVALID); +BUS_DEFINE_SET_CGROUP_WEIGHT(io_weight, CGROUP_MASK_IO, CGROUP_WEIGHT_IS_OK, CGROUP_WEIGHT_INVALID); +BUS_DEFINE_SET_CGROUP_WEIGHT(blockio_weight, CGROUP_MASK_BLKIO, CGROUP_BLKIO_WEIGHT_IS_OK, CGROUP_BLKIO_WEIGHT_INVALID); +BUS_DEFINE_SET_CGROUP_LIMIT(memory, CGROUP_MASK_MEMORY, physical_memory_scale, 1); +BUS_DEFINE_SET_CGROUP_LIMIT(memory_protection, CGROUP_MASK_MEMORY, physical_memory_scale, 0); +BUS_DEFINE_SET_CGROUP_LIMIT(swap, CGROUP_MASK_MEMORY, physical_memory_scale, 0); +BUS_DEFINE_SET_CGROUP_LIMIT(zswap, CGROUP_MASK_MEMORY, physical_memory_scale, 0); +REENABLE_WARNING; + +static int bus_cgroup_set_cpu_weight( + Unit *u, + const char *name, + uint64_t *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + uint64_t v; + int r; + assert(p); + r = sd_bus_message_read(message, "t", &v); + if (r < 0) + return r; + if (!CGROUP_WEIGHT_IS_OK(v) && v != CGROUP_WEIGHT_IDLE) + return sd_bus_error_setf( + error, SD_BUS_ERROR_INVALID_ARGS, "Value specified in %s is out of range", name); + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = v; + unit_invalidate_cgroup(u, CGROUP_MASK_CPU); + if (v == CGROUP_WEIGHT_INVALID) + unit_write_settingf(u, flags, name, "%s=", name); + else if (v == CGROUP_WEIGHT_IDLE) + unit_write_settingf(u, flags, name, "%s=idle", name); + else + unit_write_settingf(u, flags, name, "%s=%" PRIu64, name, v); + } + return 1; +} + +static int bus_cgroup_set_tasks_max( + Unit *u, + const char *name, + CGroupTasksMax *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + uint64_t v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "t", &v); + if (r < 0) + return r; + + if (v < 1) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Value specified in %s is out of range", name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = (CGroupTasksMax) { .value = v, .scale = 0 }; /* When .scale==0, .value is the absolute value */ + unit_invalidate_cgroup(u, CGROUP_MASK_PIDS); + + if (v == CGROUP_LIMIT_MAX) + unit_write_settingf(u, flags, name, + "%s=infinity", name); + else + unit_write_settingf(u, flags, name, + "%s=%" PRIu64, name, v); + } + + return 1; +} + +static int bus_cgroup_set_tasks_max_scale( + Unit *u, + const char *name, + CGroupTasksMax *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + uint32_t v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "u", &v); + if (r < 0) + return r; + + if (v < 1 || v >= UINT32_MAX) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Value specified in %s is out of range", name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = (CGroupTasksMax) { v, UINT32_MAX }; /* .scale is not 0, so this is interpreted as v/UINT32_MAX. */ + unit_invalidate_cgroup(u, CGROUP_MASK_PIDS); + + uint32_t scaled = DIV_ROUND_UP((uint64_t) v * 100U, (uint64_t) UINT32_MAX); + unit_write_settingf(u, flags, name, "%s=%" PRIu32 ".%" PRIu32 "%%", "TasksMax", + scaled / 10, scaled % 10); + } + + return 1; +} + +int bus_cgroup_set_property( + Unit *u, + CGroupContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + CGroupIOLimitType iol_type; + int r; + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "CPUAccounting")) + return bus_cgroup_set_boolean(u, name, &c->cpu_accounting, get_cpu_accounting_mask(), message, flags, error); + + if (streq(name, "CPUWeight")) + return bus_cgroup_set_cpu_weight(u, name, &c->cpu_weight, message, flags, error); + + if (streq(name, "StartupCPUWeight")) + return bus_cgroup_set_cpu_weight(u, name, &c->startup_cpu_weight, message, flags, error); + + if (streq(name, "CPUShares")) + return bus_cgroup_set_cpu_shares(u, name, &c->cpu_shares, message, flags, error); + + if (streq(name, "StartupCPUShares")) + return bus_cgroup_set_cpu_shares(u, name, &c->startup_cpu_shares, message, flags, error); + + if (streq(name, "IOAccounting")) + return bus_cgroup_set_boolean(u, name, &c->io_accounting, CGROUP_MASK_IO, message, flags, error); + + if (streq(name, "IOWeight")) + return bus_cgroup_set_io_weight(u, name, &c->io_weight, message, flags, error); + + if (streq(name, "StartupIOWeight")) + return bus_cgroup_set_io_weight(u, name, &c->startup_io_weight, message, flags, error); + + if (streq(name, "BlockIOAccounting")) + return bus_cgroup_set_boolean(u, name, &c->blockio_accounting, CGROUP_MASK_BLKIO, message, flags, error); + + if (streq(name, "BlockIOWeight")) + return bus_cgroup_set_blockio_weight(u, name, &c->blockio_weight, message, flags, error); + + if (streq(name, "StartupBlockIOWeight")) + return bus_cgroup_set_blockio_weight(u, name, &c->startup_blockio_weight, message, flags, error); + + if (streq(name, "MemoryAccounting")) + return bus_cgroup_set_boolean(u, name, &c->memory_accounting, CGROUP_MASK_MEMORY, message, flags, error); + + if (streq(name, "MemoryMin")) { + r = bus_cgroup_set_memory_protection(u, name, &c->memory_min, message, flags, error); + if (r > 0) + c->memory_min_set = true; + return r; + } + + if (streq(name, "MemoryLow")) { + r = bus_cgroup_set_memory_protection(u, name, &c->memory_low, message, flags, error); + if (r > 0) + c->memory_low_set = true; + return r; + } + + if (streq(name, "StartupMemoryLow")) { + r = bus_cgroup_set_memory_protection(u, name, &c->startup_memory_low, message, flags, error); + if (r > 0) + c->startup_memory_low_set = true; + return r; + } + + if (streq(name, "DefaultMemoryMin")) { + r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_min, message, flags, error); + if (r > 0) + c->default_memory_min_set = true; + return r; + } + + if (streq(name, "DefaultMemoryLow")) { + r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_low, message, flags, error); + if (r > 0) + c->default_memory_low_set = true; + return r; + } + + if (streq(name, "DefaultStartupMemoryLow")) { + r = bus_cgroup_set_memory_protection(u, name, &c->default_startup_memory_low, message, flags, error); + if (r > 0) + c->default_startup_memory_low_set = true; + return r; + } + + if (streq(name, "MemoryHigh")) + return bus_cgroup_set_memory(u, name, &c->memory_high, message, flags, error); + + if (streq(name, "StartupMemoryHigh")) { + r = bus_cgroup_set_memory(u, name, &c->startup_memory_high, message, flags, error); + if (r > 0) + c->startup_memory_high_set = true; + return r; + } + + if (streq(name, "MemorySwapMax")) + return bus_cgroup_set_swap(u, name, &c->memory_swap_max, message, flags, error); + + if (streq(name, "StartupMemorySwapMax")) { + r = bus_cgroup_set_swap(u, name, &c->startup_memory_swap_max, message, flags, error); + if (r > 0) + c->startup_memory_swap_max_set = true; + return r; + } + + if (streq(name, "MemoryZSwapMax")) + return bus_cgroup_set_zswap(u, name, &c->memory_zswap_max, message, flags, error); + + if (streq(name, "StartupMemoryZSwapMax")) { + r = bus_cgroup_set_zswap(u, name, &c->startup_memory_zswap_max, message, flags, error); + if (r > 0) + c->startup_memory_zswap_max_set = true; + return r; + } + + if (streq(name, "MemoryMax")) + return bus_cgroup_set_memory(u, name, &c->memory_max, message, flags, error); + + if (streq(name, "StartupMemoryMax")) { + r = bus_cgroup_set_memory(u, name, &c->startup_memory_max, message, flags, error); + if (r > 0) + c->startup_memory_max_set = true; + return r; + } + + if (streq(name, "MemoryLimit")) + return bus_cgroup_set_memory(u, name, &c->memory_limit, message, flags, error); + + if (streq(name, "MemoryMinScale")) { + r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_min, message, flags, error); + if (r > 0) + c->memory_min_set = true; + return r; + } + + if (streq(name, "MemoryLowScale")) { + r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_low, message, flags, error); + if (r > 0) + c->memory_low_set = true; + return r; + } + + if (streq(name, "DefaultMemoryMinScale")) { + r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_min, message, flags, error); + if (r > 0) + c->default_memory_min_set = true; + return r; + } + + if (streq(name, "DefaultMemoryLowScale")) { + r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_low, message, flags, error); + if (r > 0) + c->default_memory_low_set = true; + return r; + } + + if (streq(name, "MemoryHighScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_high, message, flags, error); + + if (streq(name, "MemorySwapMaxScale")) + return bus_cgroup_set_swap_scale(u, name, &c->memory_swap_max, message, flags, error); + + if (streq(name, "MemoryZSwapMaxScale")) + return bus_cgroup_set_zswap_scale(u, name, &c->memory_zswap_max, message, flags, error); + + if (streq(name, "MemoryMaxScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_max, message, flags, error); + + if (streq(name, "MemoryLimitScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_limit, message, flags, error); + + if (streq(name, "TasksAccounting")) + return bus_cgroup_set_boolean(u, name, &c->tasks_accounting, CGROUP_MASK_PIDS, message, flags, error); + + if (streq(name, "TasksMax")) + return bus_cgroup_set_tasks_max(u, name, &c->tasks_max, message, flags, error); + + if (streq(name, "TasksMaxScale")) + return bus_cgroup_set_tasks_max_scale(u, name, &c->tasks_max, message, flags, error); + + if (streq(name, "CPUQuotaPerSecUSec")) { + uint64_t u64; + + r = sd_bus_message_read(message, "t", &u64); + if (r < 0) + return r; + + if (u64 <= 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "CPUQuotaPerSecUSec= value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_quota_per_sec_usec = u64; + u->warned_clamping_cpu_quota_period = false; + unit_invalidate_cgroup(u, CGROUP_MASK_CPU); + + if (c->cpu_quota_per_sec_usec == USEC_INFINITY) + unit_write_setting(u, flags, "CPUQuota", "CPUQuota="); + else + /* config_parse_cpu_quota() requires an integer, so truncating division is used on + * purpose here. */ + unit_write_settingf(u, flags, "CPUQuota", + "CPUQuota=%0.f%%", + (double) (c->cpu_quota_per_sec_usec / 10000)); + } + + return 1; + + } else if (streq(name, "CPUQuotaPeriodUSec")) { + uint64_t u64; + + r = sd_bus_message_read(message, "t", &u64); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_quota_period_usec = u64; + u->warned_clamping_cpu_quota_period = false; + unit_invalidate_cgroup(u, CGROUP_MASK_CPU); + if (c->cpu_quota_period_usec == USEC_INFINITY) + unit_write_setting(u, flags, "CPUQuotaPeriodSec", "CPUQuotaPeriodSec="); + else + unit_write_settingf(u, flags, "CPUQuotaPeriodSec", + "CPUQuotaPeriodSec=%s", + FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1)); + } + + return 1; + + } else if (STR_IN_SET(name, "AllowedCPUs", "StartupAllowedCPUs", "AllowedMemoryNodes", "StartupAllowedMemoryNodes")) { + const void *a; + size_t n; + _cleanup_(cpu_set_reset) CPUSet new_set = {}; + + r = sd_bus_message_read_array(message, 'y', &a, &n); + if (r < 0) + return r; + + r = cpu_set_from_dbus(a, n, &new_set); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *setstr = NULL; + CPUSet *set = NULL; + + setstr = cpu_set_to_range_string(&new_set); + if (!setstr) + return -ENOMEM; + + if (streq(name, "AllowedCPUs")) + set = &c->cpuset_cpus; + else if (streq(name, "StartupAllowedCPUs")) + set = &c->startup_cpuset_cpus; + else if (streq(name, "AllowedMemoryNodes")) + set = &c->cpuset_mems; + else if (streq(name, "StartupAllowedMemoryNodes")) + set = &c->startup_cpuset_mems; + + assert(set); + + cpu_set_reset(set); + *set = new_set; + new_set = (CPUSet) {}; + + unit_invalidate_cgroup(u, CGROUP_MASK_CPUSET); + unit_write_settingf(u, flags, name, "%s=\n%s=%s", name, name, setstr); + } + + return 1; + + } else if ((iol_type = cgroup_io_limit_type_from_string(name)) >= 0) { + const char *path; + unsigned n = 0; + uint64_t u64; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceLimit *a = NULL; + + LIST_FOREACH(device_limits, b, c->io_device_limits) + if (path_equal(path, b->path)) { + a = b; + break; + } + + if (!a) { + CGroupIOLimitType type; + + a = new0(CGroupIODeviceLimit, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + + for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + a->limits[type] = cgroup_io_limit_defaults[type]; + + LIST_PREPEND(device_limits, c->io_device_limits, a); + } + + a->limits[iol_type] = u64; + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + LIST_FOREACH(device_limits, a, c->io_device_limits) + a->limits[iol_type] = cgroup_io_limit_defaults[iol_type]; + + unit_invalidate_cgroup(u, CGROUP_MASK_IO); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fprintf(f, "%s=\n", name); + LIST_FOREACH(device_limits, a, c->io_device_limits) + if (a->limits[iol_type] != cgroup_io_limit_defaults[iol_type]) + fprintf(f, "%s=%s %" PRIu64 "\n", name, a->path, a->limits[iol_type]); + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "IODeviceWeight")) { + const char *path; + uint64_t weight; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!CGROUP_WEIGHT_IS_OK(weight) || weight == CGROUP_WEIGHT_INVALID) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "IODeviceWeight= value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceWeight *a = NULL; + + LIST_FOREACH(device_weights, b, c->io_device_weights) + if (path_equal(b->path, path)) { + a = b; + break; + } + + if (!a) { + a = new0(CGroupIODeviceWeight, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + LIST_PREPEND(device_weights, c->io_device_weights, a); + } + + a->weight = weight; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + while (c->io_device_weights) + cgroup_context_free_io_device_weight(c, c->io_device_weights); + + unit_invalidate_cgroup(u, CGROUP_MASK_IO); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fputs("IODeviceWeight=\n", f); + LIST_FOREACH(device_weights, a, c->io_device_weights) + fprintf(f, "IODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight); + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "IODeviceLatencyTargetUSec")) { + const char *path; + uint64_t target; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &target)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceLatency *a = NULL; + + LIST_FOREACH(device_latencies, b, c->io_device_latencies) + if (path_equal(b->path, path)) { + a = b; + break; + } + + if (!a) { + a = new0(CGroupIODeviceLatency, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + LIST_PREPEND(device_latencies, c->io_device_latencies, a); + } + + a->target_usec = target; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + while (c->io_device_latencies) + cgroup_context_free_io_device_latency(c, c->io_device_latencies); + + unit_invalidate_cgroup(u, CGROUP_MASK_IO); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fputs("IODeviceLatencyTargetSec=\n", f); + LIST_FOREACH(device_latencies, a, c->io_device_latencies) + fprintf(f, "IODeviceLatencyTargetSec=%s %s\n", + a->path, FORMAT_TIMESPAN(a->target_usec, 1)); + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (STR_IN_SET(name, "BlockIOReadBandwidth", "BlockIOWriteBandwidth")) { + const char *path; + unsigned n = 0; + uint64_t u64; + bool read; + + read = streq(name, "BlockIOReadBandwidth"); + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupBlockIODeviceBandwidth *a = NULL; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) + if (path_equal(path, b->path)) { + a = b; + break; + } + + if (!a) { + a = new0(CGroupBlockIODeviceBandwidth, 1); + if (!a) + return -ENOMEM; + + a->rbps = CGROUP_LIMIT_MAX; + a->wbps = CGROUP_LIMIT_MAX; + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + + LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a); + } + + if (read) + a->rbps = u64; + else + a->wbps = u64; + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) { + if (read) + a->rbps = CGROUP_LIMIT_MAX; + else + a->wbps = CGROUP_LIMIT_MAX; + } + + unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + if (read) { + fputs("BlockIOReadBandwidth=\n", f); + LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) + if (a->rbps != CGROUP_LIMIT_MAX) + fprintf(f, "BlockIOReadBandwidth=%s %" PRIu64 "\n", a->path, a->rbps); + } else { + fputs("BlockIOWriteBandwidth=\n", f); + LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) + if (a->wbps != CGROUP_LIMIT_MAX) + fprintf(f, "BlockIOWriteBandwidth=%s %" PRIu64 "\n", a->path, a->wbps); + } + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "BlockIODeviceWeight")) { + const char *path; + uint64_t weight; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!CGROUP_BLKIO_WEIGHT_IS_OK(weight) || weight == CGROUP_BLKIO_WEIGHT_INVALID) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "BlockIODeviceWeight= out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupBlockIODeviceWeight *a = NULL; + + LIST_FOREACH(device_weights, b, c->blockio_device_weights) + if (path_equal(b->path, path)) { + a = b; + break; + } + + if (!a) { + a = new0(CGroupBlockIODeviceWeight, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + LIST_PREPEND(device_weights, c->blockio_device_weights, a); + } + + a->weight = weight; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + while (c->blockio_device_weights) + cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); + + unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fputs("BlockIODeviceWeight=\n", f); + LIST_FOREACH(device_weights, a, c->blockio_device_weights) + fprintf(f, "BlockIODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight); + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "DevicePolicy")) { + const char *policy; + CGroupDevicePolicy p; + + r = sd_bus_message_read(message, "s", &policy); + if (r < 0) + return r; + + p = cgroup_device_policy_from_string(policy); + if (p < 0) + return p; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->device_policy = p; + unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES); + unit_write_settingf(u, flags, name, "DevicePolicy=%s", policy); + } + + return 1; + + } else if (streq(name, "DeviceAllow")) { + const char *path, *rwm; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &path, &rwm)) > 0) { + CGroupDevicePermissions p; + + if (!valid_device_allow_pattern(path) || strpbrk(path, WHITESPACE)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires device node or pattern"); + + if (isempty(rwm)) + p = _CGROUP_DEVICE_PERMISSIONS_ALL; + else { + p = cgroup_device_permissions_from_string(rwm); + if (p < 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires combination of rwm flags"); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = cgroup_context_add_or_update_device_allow(c, path, p); + if (r < 0) + return r; + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + while (c->device_allow) + cgroup_context_free_device_allow(c, c->device_allow); + + unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fputs("DeviceAllow=\n", f); + LIST_FOREACH(device_allow, a, c->device_allow) + fprintf(f, "DeviceAllow=%s %s\n", a->path, cgroup_device_permissions_to_string(a->permissions)); + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "IPAccounting")) { + int b; + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->ip_accounting = b; + + unit_invalidate_cgroup_bpf(u); + unit_write_settingf(u, flags, name, "IPAccounting=%s", yes_no(b)); + } + + return 1; + + } else if (STR_IN_SET(name, "IPAddressAllow", "IPAddressDeny")) { + _cleanup_set_free_ Set *new_prefixes = NULL; + size_t n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(iayu)"); + if (r < 0) + return r; + + for (;;) { + const void *ap; + int32_t family; + uint32_t prefixlen; + size_t an; + + r = sd_bus_message_enter_container(message, 'r', "iayu"); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_read(message, "i", &family); + if (r < 0) + return r; + + if (!IN_SET(family, AF_INET, AF_INET6)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects IPv4 or IPv6 addresses only.", name); + + r = sd_bus_message_read_array(message, 'y', &ap, &an); + if (r < 0) + return r; + + if (an != FAMILY_ADDRESS_SIZE(family)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "IP address has wrong size for family (%s, expected %zu, got %zu)", + af_to_name(family), FAMILY_ADDRESS_SIZE(family), an); + + r = sd_bus_message_read(message, "u", &prefixlen); + if (r < 0) + return r; + + if (prefixlen > FAMILY_ADDRESS_SIZE(family)*8) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Prefix length %" PRIu32 " too large for address family %s.", prefixlen, af_to_name(family)); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + struct in_addr_prefix prefix = { + .family = family, + .prefixlen = prefixlen, + }; + + memcpy(&prefix.address, ap, an); + + r = in_addr_prefix_add(&new_prefixes, &prefix); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + Set **prefixes; + bool *reduced; + FILE *f; + + unit_invalidate_cgroup_bpf(u); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + prefixes = streq(name, "IPAddressAllow") ? &c->ip_address_allow : &c->ip_address_deny; + reduced = streq(name, "IPAddressAllow") ? &c->ip_address_allow_reduced : &c->ip_address_deny_reduced; + + if (n == 0) { + *reduced = true; + *prefixes = set_free(*prefixes); + fputs(name, f); + fputs("=\n", f); + } else { + *reduced = false; + + r = in_addr_prefixes_merge(prefixes, new_prefixes); + if (r < 0) + return r; + + const struct in_addr_prefix *p; + SET_FOREACH(p, new_prefixes) + fprintf(f, "%s=%s\n", name, + IN_ADDR_PREFIX_TO_STRING(p->family, &p->address, p->prefixlen)); + } + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + } + + if (STR_IN_SET(name, "ManagedOOMSwap", "ManagedOOMMemoryPressure")) { + ManagedOOMMode *cgroup_mode = streq(name, "ManagedOOMSwap") ? &c->moom_swap : &c->moom_mem_pressure; + ManagedOOMMode m; + const char *mode; + + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + + r = sd_bus_message_read(message, "s", &mode); + if (r < 0) + return r; + + m = managed_oom_mode_from_string(mode); + if (m < 0) + return -EINVAL; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *cgroup_mode = m; + unit_write_settingf(u, flags, name, "%s=%s", name, mode); + } + + (void) manager_varlink_send_managed_oom_update(u); + return 1; + } + + if (streq(name, "ManagedOOMMemoryPressureLimit")) { + uint32_t v; + + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + + r = sd_bus_message_read(message, "u", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->moom_mem_pressure_limit = v; + unit_write_settingf(u, flags, name, + "ManagedOOMMemoryPressureLimit=" PERMYRIAD_AS_PERCENT_FORMAT_STR, + PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(v))); + } + + if (c->moom_mem_pressure == MANAGED_OOM_KILL) + (void) manager_varlink_send_managed_oom_update(u); + + return 1; + } + + if (streq(name, "ManagedOOMPreference")) { + ManagedOOMPreference p; + const char *pref; + + r = sd_bus_message_read(message, "s", &pref); + if (r < 0) + return r; + + p = managed_oom_preference_from_string(pref); + if (p < 0) + return p; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->moom_preference = p; + unit_write_settingf(u, flags, name, "ManagedOOMPreference=%s", pref); + } + + return 1; + } + if (STR_IN_SET(name, "SocketBindAllow", "SocketBindDeny")) { + CGroupSocketBindItem **list; + uint16_t nr_ports, port_min; + size_t n = 0; + int32_t family, ip_protocol; + + list = streq(name, "SocketBindAllow") ? &c->socket_bind_allow : &c->socket_bind_deny; + + r = sd_bus_message_enter_container(message, 'a', "(iiqq)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(iiqq)", &family, &ip_protocol, &nr_ports, &port_min)) > 0) { + + if (!IN_SET(family, AF_UNSPEC, AF_INET, AF_INET6)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects INET or INET6 family, if specified.", name); + + if (!IN_SET(ip_protocol, 0, IPPROTO_TCP, IPPROTO_UDP)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects TCP or UDP protocol, if specified.", name); + + if (port_min + (uint32_t) nr_ports > (1 << 16)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects maximum port value lesser than 65536.", name); + + if (port_min == 0 && nr_ports != 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects port range starting with positive value.", name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ CGroupSocketBindItem *item = NULL; + + item = new(CGroupSocketBindItem, 1); + if (!item) + return log_oom(); + + *item = (CGroupSocketBindItem) { + .address_family = family, + .ip_protocol = ip_protocol, + .nr_ports = nr_ports, + .port_min = port_min + }; + + LIST_PREPEND(socket_bind_items, *list, TAKE_PTR(item)); + } + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + cgroup_context_remove_socket_bind(list); + else { + if ((u->manager->cgroup_supported & CGROUP_MASK_BPF_SOCKET_BIND) == 0) + log_full(LOG_DEBUG, + "Unit %s configures source compiled BPF programs " + "but the local system does not support that.\n" + "Starting this unit will fail!", u->id); + } + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + if (n == 0) + fprintf(f, "%s=\n", name); + else + LIST_FOREACH(socket_bind_items, item, *list) { + fprintf(f, "%s=", name); + cgroup_context_dump_socket_bind_item(item, f); + fputc('\n', f); + } + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + } + if (streq(name, "RestrictNetworkInterfaces")) { + int is_allow_list; + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &is_allow_list); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + + if (strv_isempty(l)) { + c->restrict_network_interfaces_is_allow_list = false; + c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces); + + unit_write_settingf(u, flags, name, "%s=", name); + return 1; + } + + if (set_isempty(c->restrict_network_interfaces)) + c->restrict_network_interfaces_is_allow_list = is_allow_list; + + STRV_FOREACH(s, l) { + if (!ifname_valid(*s)) { + log_full(LOG_WARNING, "Invalid interface name, ignoring: %s", *s); + continue; + } + if (c->restrict_network_interfaces_is_allow_list != (bool) is_allow_list) + free(set_remove(c->restrict_network_interfaces, *s)); + else { + r = set_put_strdup(&c->restrict_network_interfaces, *s); + if (r < 0) + return log_oom(); + } + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s%s", name, is_allow_list ? "" : "~", joined); + } + + return 1; + } + + if (streq(name, "NFTSet")) { + int source, nfproto; + const char *table, *set; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(iiss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(iiss)", &source, &nfproto, &table, &set)) > 0) { + const char *source_name, *nfproto_name; + + if (!IN_SET(source, NFT_SET_SOURCE_CGROUP, NFT_SET_SOURCE_USER, NFT_SET_SOURCE_GROUP)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid source %d.", source); + + source_name = nft_set_source_to_string(source); + assert(source_name); + + nfproto_name = nfproto_to_string(nfproto); + if (!nfproto_name) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid protocol %d.", nfproto); + + if (!nft_identifier_valid(table)) { + _cleanup_free_ char *esc = NULL; + + esc = cescape(table); + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NFT table name %s.", strna(esc)); + } + + if (!nft_identifier_valid(set)) { + _cleanup_free_ char *esc = NULL; + + esc = cescape(set); + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NFT set name %s.", strna(esc)); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = nft_set_add(&c->nft_set_context, source, nfproto, table, set); + if (r < 0) + return r; + + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s:%s:%s:%s", + name, + source_name, + nfproto_name, + table, + set); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (empty && !UNIT_WRITE_FLAGS_NOOP(flags)) { + nft_set_context_clear(&c->nft_set_context); + unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + } + + /* must be last */ + if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB)) + return bus_cgroup_set_transient_property(u, c, name, message, flags, error); + + return 0; +} diff --git a/src/core/dbus-cgroup.h b/src/core/dbus-cgroup.h new file mode 100644 index 0000000..dd0d5da --- /dev/null +++ b/src/core/dbus-cgroup.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" +#include "cgroup.h" + +extern const sd_bus_vtable bus_cgroup_vtable[]; + +int bus_property_get_tasks_max(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_cgroup_pressure_watch(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); + +int bus_cgroup_set_property(Unit *u, CGroupContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-device.c b/src/core/dbus-device.c new file mode 100644 index 0000000..b5e18d8 --- /dev/null +++ b/src/core/dbus-device.c @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dbus-device.h" +#include "device.h" +#include "unit.h" + +const sd_bus_vtable bus_device_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("SysFSPath", "s", NULL, offsetof(Device, sysfs), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_VTABLE_END +}; diff --git a/src/core/dbus-device.h b/src/core/dbus-device.h new file mode 100644 index 0000000..bfb5770 --- /dev/null +++ b/src/core/dbus-device.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus-vtable.h" + +extern const sd_bus_vtable bus_device_vtable[]; diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c new file mode 100644 index 0000000..2d05ba7 --- /dev/null +++ b/src/core/dbus-execute.c @@ -0,0 +1,3758 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include "af-list.h" +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "bus-util.h" +#include "cap-list.h" +#include "capability-util.h" +#include "cpu-set-util.h" +#include "creds-util.h" +#include "dbus-execute.h" +#include "dbus-util.h" +#include "env-util.h" +#include "errno-list.h" +#include "escape.h" +#include "exec-credential.h" +#include "execute.h" +#include "fd-util.h" +#include "fileio.h" +#include "hexdecoct.h" +#include "iovec-util.h" +#include "ioprio-util.h" +#include "journal-file.h" +#include "load-fragment.h" +#include "memstream-util.h" +#include "missing_ioprio.h" +#include "mountpoint-util.h" +#include "namespace.h" +#include "parse-util.h" +#include "path-util.h" +#include "pcre2-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "seccomp-util.h" +#include "securebits-util.h" +#include "specifier.h" +#include "stat-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "unit-printf.h" +#include "user-util.h" +#include "utf8.h" + +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_output, exec_output, ExecOutput); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInput); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode); +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long); +static BUS_DEFINE_PROPERTY_GET(property_get_ioprio, "i", ExecContext, exec_context_get_effective_ioprio); +static BUS_DEFINE_PROPERTY_GET(property_get_mount_apivfs, "b", ExecContext, exec_context_get_effective_mount_apivfs); +static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_class, "i", ExecContext, exec_context_get_effective_ioprio, ioprio_prio_class); +static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext, exec_context_get_effective_ioprio, ioprio_prio_data); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC); +static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa); +static BUS_DEFINE_PROPERTY_GET(property_get_oom_score_adjust, "i", ExecContext, exec_context_get_oom_score_adjust); +static BUS_DEFINE_PROPERTY_GET(property_get_nice, "i", ExecContext, exec_context_get_nice); +static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_policy, "i", ExecContext, exec_context_get_cpu_sched_policy); +static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_priority, "i", ExecContext, exec_context_get_cpu_sched_priority); +static BUS_DEFINE_PROPERTY_GET(property_get_coredump_filter, "t", ExecContext, exec_context_get_coredump_filter); +static BUS_DEFINE_PROPERTY_GET(property_get_timer_slack_nsec, "t", ExecContext, exec_context_get_timer_slack_nsec); + +static int property_get_environment_files( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sb)"); + if (r < 0) + return r; + + STRV_FOREACH(j, c->environment_files) { + const char *fn = *j; + + r = sd_bus_message_append(reply, "(sb)", fn[0] == '-' ? fn + 1 : fn, fn[0] == '-'); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_cpu_affinity( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + _cleanup_(cpu_set_reset) CPUSet s = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + + if (c->cpu_affinity_from_numa) { + int r; + + r = numa_to_cpu_set(&c->numa_policy, &s); + if (r < 0) + return r; + } + + (void) cpu_set_to_dbus(c->cpu_affinity_from_numa ? &s : &c->cpu_set, &array, &allocated); + + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_numa_mask( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + + (void) cpu_set_to_dbus(&c->numa_policy.nodes, &array, &allocated); + + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_numa_policy( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + ExecContext *c = ASSERT_PTR(userdata); + int32_t policy; + + assert(bus); + assert(reply); + + policy = numa_policy_get_type(&c->numa_policy); + + return sd_bus_message_append_basic(reply, 'i', &policy); +} + +static int property_get_syscall_filter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "b", c->syscall_allow_list); + if (r < 0) + return r; + + l = exec_context_get_syscall_filter(c); + if (!l) + return -ENOMEM; + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int property_get_syscall_log( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "b", c->syscall_log_allow_list); + if (r < 0) + return r; + + l = exec_context_get_syscall_log(c); + if (!l) + return -ENOMEM; + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int property_get_syscall_archs( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(bus); + assert(reply); + + l = exec_context_get_syscall_archs(c); + if (!l) + return -ENOMEM; + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return 0; +} + +static int property_get_selinux_context( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "(bs)", c->selinux_context_ignore, c->selinux_context); +} + +static int property_get_apparmor_profile( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "(bs)", c->apparmor_profile_ignore, c->apparmor_profile); +} + +static int property_get_smack_process_label( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "(bs)", c->smack_process_label_ignore, c->smack_process_label); +} + +static int property_get_address_families( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "b", c->address_families_allow_list); + if (r < 0) + return r; + + l = exec_context_get_address_families(c); + if (!l) + return -ENOMEM; + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int property_get_working_directory( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + const char *wd; + + assert(bus); + assert(reply); + + if (c->working_directory_home) + wd = "~"; + else + wd = c->working_directory; + + if (c->working_directory_missing_ok) + wd = strjoina("!", wd); + + return sd_bus_message_append(reply, "s", wd); +} + +static int property_get_stdio_fdname( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + int fileno; + + assert(bus); + assert(property); + assert(reply); + + if (streq(property, "StandardInputFileDescriptorName")) + fileno = STDIN_FILENO; + else if (streq(property, "StandardOutputFileDescriptorName")) + fileno = STDOUT_FILENO; + else { + assert(streq(property, "StandardErrorFileDescriptorName")); + fileno = STDERR_FILENO; + } + + return sd_bus_message_append(reply, "s", exec_context_fdname(c, fileno)); +} + +static int property_get_input_data( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + + assert(bus); + assert(property); + assert(reply); + + return sd_bus_message_append_array(reply, 'y', c->stdin_data, c->stdin_data_size); +} + +static int property_get_restrict_filesystems( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + _cleanup_free_ char **l = NULL; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "b", c->restrict_filesystems_allow_list); + if (r < 0) + return r; + + l = exec_context_get_restrict_filesystems(c); + if (!l) + return -ENOMEM; + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int property_get_bind_paths( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + bool ro; + int r; + + assert(bus); + assert(property); + assert(reply); + + ro = strstr(property, "ReadOnly"); + + r = sd_bus_message_open_container(reply, 'a', "(ssbt)"); + if (r < 0) + return r; + + for (size_t i = 0; i < c->n_bind_mounts; i++) { + + if (ro != c->bind_mounts[i].read_only) + continue; + + r = sd_bus_message_append( + reply, "(ssbt)", + c->bind_mounts[i].source, + c->bind_mounts[i].destination, + c->bind_mounts[i].ignore_enoent, + c->bind_mounts[i].recursive ? (uint64_t) MS_REC : UINT64_C(0)); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_temporary_filesystems( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + for (unsigned i = 0; i < c->n_temporary_filesystems; i++) { + TemporaryFileSystem *t = c->temporary_filesystems + i; + + r = sd_bus_message_append( + reply, "(ss)", + t->path, + t->options); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_log_extra_fields( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "ay"); + if (r < 0) + return r; + + for (size_t i = 0; i < c->n_log_extra_fields; i++) { + r = sd_bus_message_append_array(reply, 'y', c->log_extra_fields[i].iov_base, c->log_extra_fields[i].iov_len); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int sd_bus_message_append_log_filter_patterns(sd_bus_message *reply, Set *patterns, bool is_allowlist) { + const char *pattern; + int r; + + assert(reply); + + SET_FOREACH(pattern, patterns) { + r = sd_bus_message_append(reply, "(bs)", is_allowlist, pattern); + if (r < 0) + return r; + } + + return 0; +} + +static int property_get_log_filter_patterns( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int r; + + assert(c); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(bs)"); + if (r < 0) + return r; + + r = sd_bus_message_append_log_filter_patterns(reply, c->log_filter_allowed_patterns, + /* is_allowlist = */ true); + if (r < 0) + return r; + + r = sd_bus_message_append_log_filter_patterns(reply, c->log_filter_denied_patterns, + /* is_allowlist = */ false); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int property_get_set_credential( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + ExecSetCredential *sc; + int r; + + assert(bus); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(say)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(sc, c->set_credentials) { + + if (sc->encrypted != streq(property, "SetCredentialEncrypted")) + continue; + + r = sd_bus_message_open_container(reply, 'r', "say"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", sc->id); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', sc->data, sc->size); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_load_credential( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + ExecLoadCredential *lc; + int r; + + assert(bus); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(lc, c->load_credentials) { + + if (lc->encrypted != streq(property, "LoadCredentialEncrypted")) + continue; + + r = sd_bus_message_append(reply, "(ss)", lc->id, lc->path); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_root_hash( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + + assert(bus); + assert(property); + assert(reply); + + return sd_bus_message_append_array(reply, 'y', c->root_hash, c->root_hash_size); +} + +static int property_get_root_hash_sig( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + + assert(bus); + assert(property); + assert(reply); + + return sd_bus_message_append_array(reply, 'y', c->root_hash_sig, c->root_hash_sig_size); +} + +static int property_get_root_image_options( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(mount_options, m, c->root_image_options) { + r = sd_bus_message_append(reply, "(ss)", + partition_designator_to_string(m->partition_designator), + m->options); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_mount_images( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ssba(ss))"); + if (r < 0) + return r; + + for (size_t i = 0; i < c->n_mount_images; i++) { + r = sd_bus_message_open_container(reply, SD_BUS_TYPE_STRUCT, "ssba(ss)"); + if (r < 0) + return r; + r = sd_bus_message_append( + reply, "ssb", + c->mount_images[i].source, + c->mount_images[i].destination, + c->mount_images[i].ignore_enoent); + if (r < 0) + return r; + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + LIST_FOREACH(mount_options, m, c->mount_images[i].mount_options) { + r = sd_bus_message_append(reply, "(ss)", + partition_designator_to_string(m->partition_designator), + m->options); + if (r < 0) + return r; + } + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_extension_images( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sba(ss))"); + if (r < 0) + return r; + + for (size_t i = 0; i < c->n_extension_images; i++) { + r = sd_bus_message_open_container(reply, SD_BUS_TYPE_STRUCT, "sba(ss)"); + if (r < 0) + return r; + r = sd_bus_message_append( + reply, "sb", + c->extension_images[i].source, + c->extension_images[i].ignore_enoent); + if (r < 0) + return r; + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + LIST_FOREACH(mount_options, m, c->extension_images[i].mount_options) { + r = sd_bus_message_append(reply, "(ss)", + partition_designator_to_string(m->partition_designator), + m->options); + if (r < 0) + return r; + } + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int bus_property_get_exec_dir( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecDirectory *d = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (size_t i = 0; i < d->n_items; i++) { + r = sd_bus_message_append_basic(reply, 's', d->items[i].path); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int bus_property_get_exec_dir_symlink( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecDirectory *d = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sst)"); + if (r < 0) + return r; + + for (size_t i = 0; i < d->n_items; i++) + STRV_FOREACH(dst, d->items[i].symlinks) { + r = sd_bus_message_append(reply, "(sst)", d->items[i].path, *dst, UINT64_C(0) /* flags, unused for now */); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_image_policy( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ImagePolicy **pp = ASSERT_PTR(userdata); + _cleanup_free_ char *s = NULL; + int r; + + assert(bus); + assert(property); + assert(reply); + + r = image_policy_to_string(*pp ?: &image_policy_service, /* simplify= */ true, &s); + if (r < 0) + return r; + + return sd_bus_message_append(reply, "s", s); +} + +const sd_bus_vtable bus_exec_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Environment", "as", NULL, offsetof(ExecContext, environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("EnvironmentFiles", "a(sb)", property_get_environment_files, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassEnvironment", "as", NULL, offsetof(ExecContext, pass_environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UnsetEnvironment", "as", NULL, offsetof(ExecContext, unset_environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UMask", "u", bus_property_get_mode, offsetof(ExecContext, umask), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCPU", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCPUSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitFSIZE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitDATA", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitDATASoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSTACK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCORE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCORESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRSS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRSSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNOFILE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitAS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitASSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNPROC", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitLOCKS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNICE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNICESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTPRIO", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTTIME", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WorkingDirectory", "s", property_get_working_directory, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(ExecContext, root_directory), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootImage", "s", NULL, offsetof(ExecContext, root_image), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootImageOptions", "a(ss)", property_get_root_image_options, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootHash", "ay", property_get_root_hash, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootHashPath", "s", NULL, offsetof(ExecContext, root_hash_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CoredumpFilter", "t", property_get_coredump_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Nice", "i", property_get_nice, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IOSchedulingClass", "i", property_get_ioprio_class, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IOSchedulingPriority", "i", property_get_ioprio_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUAffinityFromNUMA", "b", property_get_cpu_affinity_from_numa, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInput", "s", property_get_exec_input, offsetof(ExecContext, std_input), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInputData", "ay", property_get_input_data, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardOutput", "s", bus_property_get_exec_output, offsetof(ExecContext, std_output), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardOutputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardError", "s", bus_property_get_exec_output, offsetof(ExecContext, std_error), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardErrorFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYPath", "s", NULL, offsetof(ExecContext, tty_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYReset", "b", bus_property_get_bool, offsetof(ExecContext, tty_reset), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYVHangup", "b", bus_property_get_bool, offsetof(ExecContext, tty_vhangup), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYVTDisallocate", "b", bus_property_get_bool, offsetof(ExecContext, tty_vt_disallocate), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYRows", "q", bus_property_get_unsigned, offsetof(ExecContext, tty_rows), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYColumns", "q", bus_property_get_unsigned, offsetof(ExecContext, tty_cols), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogPriority", "i", bus_property_get_int, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogIdentifier", "s", NULL, offsetof(ExecContext, syslog_identifier), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogLevelPrefix", "b", bus_property_get_bool, offsetof(ExecContext, syslog_level_prefix), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogLevel", "i", property_get_syslog_level, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogFacility", "i", property_get_syslog_facility, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogLevelMax", "i", bus_property_get_int, offsetof(ExecContext, log_level_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogRateLimitIntervalUSec", "t", bus_property_get_usec, offsetof(ExecContext, log_ratelimit_interval_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogRateLimitBurst", "u", bus_property_get_unsigned, offsetof(ExecContext, log_ratelimit_burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogExtraFields", "aay", property_get_log_extra_fields, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogFilterPatterns", "a(bs)", property_get_log_filter_patterns, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogNamespace", "s", NULL, offsetof(ExecContext, log_namespace), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SecureBits", "i", bus_property_get_int, offsetof(ExecContext, secure_bits), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CapabilityBoundingSet", "t", NULL, offsetof(ExecContext, capability_bounding_set), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AmbientCapabilities", "t", NULL, offsetof(ExecContext, capability_ambient_set), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("User", "s", NULL, offsetof(ExecContext, user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Group", "s", NULL, offsetof(ExecContext, group), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DynamicUser", "b", bus_property_get_bool, offsetof(ExecContext, dynamic_user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SetLoginEnvironment", "b", bus_property_get_tristate, offsetof(ExecContext, set_login_environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(ExecContext, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SetCredential", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SetCredentialEncrypted", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LoadCredential", "a(ss)", property_get_load_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LoadCredentialEncrypted", "a(ss)", property_get_load_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ImportCredential", "as", bus_property_get_string_set, offsetof(ExecContext, import_credentials), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SupplementaryGroups", "as", NULL, offsetof(ExecContext, supplementary_groups), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PAMName", "s", NULL, offsetof(ExecContext, pam_name), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReadWritePaths", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReadOnlyPaths", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ExecPaths", "as", NULL, offsetof(ExecContext, exec_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NoExecPaths", "as", NULL, offsetof(ExecContext, no_exec_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ExecSearchPath", "as", NULL, offsetof(ExecContext, exec_search_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_propagation_flag), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectClock", "b", bus_property_get_bool, offsetof(ExecContext, protect_clock), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_tristate, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateIPC", "b", bus_property_get_bool, offsetof(ExecContext, private_ipc), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UtmpIdentifier", "s", NULL, offsetof(ExecContext, utmp_id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UtmpMode", "s", property_get_exec_utmp_mode, offsetof(ExecContext, utmp_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SELinuxContext", "(bs)", property_get_selinux_context, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AppArmorProfile", "(bs)", property_get_apparmor_profile, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackProcessLabel", "(bs)", property_get_smack_process_label, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IgnoreSIGPIPE", "b", bus_property_get_bool, offsetof(ExecContext, ignore_sigpipe), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NoNewPrivileges", "b", bus_property_get_bool, offsetof(ExecContext, no_new_privileges), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallFilter", "(bas)", property_get_syscall_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallArchitectures", "as", property_get_syscall_archs, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallErrorNumber", "i", bus_property_get_int, offsetof(ExecContext, syscall_errno), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallLog", "(bas)", property_get_syscall_log, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Personality", "s", property_get_personality, offsetof(ExecContext, personality), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LockPersonality", "b", bus_property_get_bool, offsetof(ExecContext, lock_personality), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictAddressFamilies", "(bas)", property_get_address_families, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectoryPreserve", "s", bus_property_get_exec_preserve_mode, offsetof(ExecContext, runtime_directory_preserve_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StateDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StateDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StateDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CacheDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CacheDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CacheDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogsDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogsDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogsDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConfigurationDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConfigurationDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutCleanUSec", "t", bus_property_get_usec, offsetof(ExecContext, timeout_clean_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictSUIDSGID", "b", bus_property_get_bool, offsetof(ExecContext, restrict_suid_sgid), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictFileSystems", "(bas)", property_get_restrict_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountAPIVFS", "b", property_get_mount_apivfs, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, root_image_policy), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, mount_image_policy), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ExtensionImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, extension_image_policy), SD_BUS_VTABLE_PROPERTY_CONST), + + /* Obsolete/redundant properties: */ + SD_BUS_PROPERTY("Capabilities", "s", property_get_empty_string, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("ReadWriteDirectories", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("ReadOnlyDirectories", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("InaccessibleDirectories", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("IOScheduling", "i", property_get_ioprio, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + + SD_BUS_VTABLE_END +}; + +static int append_exec_command(sd_bus_message *reply, ExecCommand *c) { + int r; + + assert(reply); + assert(c); + + if (!c->path) + return 0; + + r = sd_bus_message_open_container(reply, 'r', "sasbttttuii"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", c->path); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(reply, c->argv); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "bttttuii", + !!(c->flags & EXEC_COMMAND_IGNORE_FAILURE), + c->exec_status.start_timestamp.realtime, + c->exec_status.start_timestamp.monotonic, + c->exec_status.exit_timestamp.realtime, + c->exec_status.exit_timestamp.monotonic, + (uint32_t) c->exec_status.pid, + (int32_t) c->exec_status.code, + (int32_t) c->exec_status.status); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int append_exec_ex_command(sd_bus_message *reply, ExecCommand *c) { + _cleanup_strv_free_ char **ex_opts = NULL; + int r; + + assert(reply); + assert(c); + + if (!c->path) + return 0; + + r = sd_bus_message_open_container(reply, 'r', "sasasttttuii"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", c->path); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(reply, c->argv); + if (r < 0) + return r; + + r = exec_command_flags_to_strv(c->flags, &ex_opts); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(reply, ex_opts); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "ttttuii", + c->exec_status.start_timestamp.realtime, + c->exec_status.start_timestamp.monotonic, + c->exec_status.exit_timestamp.realtime, + c->exec_status.exit_timestamp.monotonic, + (uint32_t) c->exec_status.pid, + (int32_t) c->exec_status.code, + (int32_t) c->exec_status.status); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +int bus_property_get_exec_command( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *ret_error) { + + ExecCommand *c = (ExecCommand*) userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)"); + if (r < 0) + return r; + + r = append_exec_command(reply, c); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +int bus_property_get_exec_command_list( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *ret_error) { + + ExecCommand *exec_command = *(ExecCommand**) userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)"); + if (r < 0) + return r; + + LIST_FOREACH(command, c, exec_command) { + r = append_exec_command(reply, c); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +int bus_property_get_exec_ex_command_list( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *ret_error) { + + ExecCommand *exec_command = *(ExecCommand**) userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sasasttttuii)"); + if (r < 0) + return r; + + LIST_FOREACH(command, c, exec_command) { + r = append_exec_ex_command(reply, c); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static char *exec_command_flags_to_exec_chars(ExecCommandFlags flags) { + return strjoin(FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE) ? "-" : "", + FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND) ? ":" : "", + FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED) ? "+" : "", + FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID) ? "!" : "", + FLAGS_SET(flags, EXEC_COMMAND_AMBIENT_MAGIC) ? "!!" : ""); +} + +int bus_set_transient_exec_command( + Unit *u, + const char *name, + ExecCommand **exec_command, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + bool is_ex_prop = endswith(name, "Ex"); + unsigned n = 0; + int r; + + /* Drop Ex from the written setting. E.g. ExecStart=, not ExecStartEx=. */ + const char *written_name = is_ex_prop ? strndupa(name, strlen(name) - 2) : name; + + r = sd_bus_message_enter_container(message, 'a', is_ex_prop ? "(sasas)" : "(sasb)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_enter_container(message, 'r', is_ex_prop ? "sasas" : "sasb")) > 0) { + _cleanup_strv_free_ char **argv = NULL, **ex_opts = NULL; + const char *path; + int b; + + r = sd_bus_message_read(message, "s", &path); + if (r < 0) + return r; + + if (!path_is_absolute(path) && !filename_is_valid(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "\"%s\" is neither a valid executable name nor an absolute path", + path); + + r = sd_bus_message_read_strv(message, &argv); + if (r < 0) + return r; + + if (strv_isempty(argv)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "\"%s\" argv cannot be empty", name); + + r = is_ex_prop ? sd_bus_message_read_strv(message, &ex_opts) : sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + ExecCommand *c; + + c = new0(ExecCommand, 1); + if (!c) + return -ENOMEM; + + c->path = strdup(path); + if (!c->path) { + free(c); + return -ENOMEM; + } + + c->argv = TAKE_PTR(argv); + + if (is_ex_prop) { + r = exec_command_flags_from_strv(ex_opts, &c->flags); + if (r < 0) + return r; + } else + c->flags = b ? EXEC_COMMAND_IGNORE_FAILURE : 0; + + path_simplify(c->path); + exec_command_append_list(exec_command, c); + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + FILE *f; + + if (n == 0) + *exec_command = exec_command_free_list(*exec_command); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fprintf(f, "%s=\n", written_name); + + LIST_FOREACH(command, c, *exec_command) { + _cleanup_free_ char *a = NULL, *exec_chars = NULL; + UnitWriteFlags esc_flags = UNIT_ESCAPE_SPECIFIERS | + (FLAGS_SET(c->flags, EXEC_COMMAND_NO_ENV_EXPAND) ? UNIT_ESCAPE_EXEC_SYNTAX : UNIT_ESCAPE_EXEC_SYNTAX_ENV); + + exec_chars = exec_command_flags_to_exec_chars(c->flags); + if (!exec_chars) + return -ENOMEM; + + a = unit_concat_strv(c->argv, esc_flags); + if (!a) + return -ENOMEM; + + if (streq_ptr(c->path, c->argv ? c->argv[0] : NULL)) + fprintf(f, "%s=%s%s\n", written_name, exec_chars, a); + else { + _cleanup_free_ char *t = NULL; + const char *p; + + p = unit_escape_setting(c->path, esc_flags, &t); + if (!p) + return -ENOMEM; + + fprintf(f, "%s=%s@%s %s\n", written_name, exec_chars, p, a); + } + } + + r = memstream_finalize(&m, &buf, NULL); + if (r < 0) + return r; + + unit_write_setting(u, flags, written_name, buf); + } + + return 1; +} + +static int parse_personality(const char *s, unsigned long *p) { + unsigned long v; + + assert(p); + + v = personality_from_string(s); + if (v == PERSONALITY_INVALID) + return -EINVAL; + + *p = v; + return 0; +} + +static const char* mount_propagation_flag_to_string_with_check(unsigned long n) { + if (!mount_propagation_flag_is_valid(n)) + return NULL; + + return mount_propagation_flag_to_string(n); +} + +static BUS_DEFINE_SET_TRANSIENT(nsec, "t", uint64_t, nsec_t, NSEC_FMT); +static BUS_DEFINE_SET_TRANSIENT_IS_VALID(log_level, "i", int32_t, int, "%" PRIi32, log_level_is_valid); +#if HAVE_SECCOMP +static BUS_DEFINE_SET_TRANSIENT_IS_VALID(errno, "i", int32_t, int, "%" PRIi32, seccomp_errno_or_action_is_valid); +#endif +static BUS_DEFINE_SET_TRANSIENT_PARSE(std_input, ExecInput, exec_input_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(std_output, ExecOutput, exec_output_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string); +BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(capability, "t", uint64_t, uint64_t, "%" PRIu64, capability_set_to_string); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(namespace_flag, "t", uint64_t, unsigned long, "%" PRIu64, namespace_flags_to_string); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(mount_propagation_flag, "t", uint64_t, unsigned long, "%" PRIu64, mount_propagation_flag_to_string_with_check); + +int bus_exec_context_set_transient_property( + Unit *u, + ExecContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *suffix; + int r; + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "User")) + return bus_set_transient_user_relaxed(u, name, &c->user, message, flags, error); + + if (streq(name, "Group")) + return bus_set_transient_user_relaxed(u, name, &c->group, message, flags, error); + + if (streq(name, "SetLoginEnvironment")) + return bus_set_transient_tristate(u, name, &c->set_login_environment, message, flags, error); + + if (streq(name, "TTYPath")) + return bus_set_transient_path(u, name, &c->tty_path, message, flags, error); + + if (streq(name, "RootImage")) + return bus_set_transient_path(u, name, &c->root_image, message, flags, error); + + if (streq(name, "RootImageOptions")) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *format_str = NULL; + + r = bus_read_mount_options(message, error, &options, &format_str, " "); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (options) { + LIST_JOIN(mount_options, c->root_image_options, options); + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s", + name, + format_str); + } else { + c->root_image_options = mount_options_free_all(c->root_image_options); + unit_write_settingf(u, flags, name, "%s=", name); + } + } + + return 1; + } + + if (streq(name, "RootHash")) { + const void *roothash_decoded; + size_t roothash_decoded_size; + + r = sd_bus_message_read_array(message, 'y', &roothash_decoded, &roothash_decoded_size); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *encoded = NULL; + + if (roothash_decoded_size == 0) { + c->root_hash_path = mfree(c->root_hash_path); + c->root_hash = mfree(c->root_hash); + c->root_hash_size = 0; + + unit_write_settingf(u, flags, name, "RootHash="); + } else { + _cleanup_free_ void *p = NULL; + + encoded = hexmem(roothash_decoded, roothash_decoded_size); + if (!encoded) + return -ENOMEM; + + p = memdup(roothash_decoded, roothash_decoded_size); + if (!p) + return -ENOMEM; + + free_and_replace(c->root_hash, p); + c->root_hash_size = roothash_decoded_size; + c->root_hash_path = mfree(c->root_hash_path); + + unit_write_settingf(u, flags, name, "RootHash=%s", encoded); + } + } + + return 1; + } + + if (streq(name, "RootHashPath")) { + c->root_hash_size = 0; + c->root_hash = mfree(c->root_hash); + + return bus_set_transient_path(u, "RootHash", &c->root_hash_path, message, flags, error); + } + + if (streq(name, "RootHashSignature")) { + const void *roothash_sig_decoded; + size_t roothash_sig_decoded_size; + + r = sd_bus_message_read_array(message, 'y', &roothash_sig_decoded, &roothash_sig_decoded_size); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *encoded = NULL; + + if (roothash_sig_decoded_size == 0) { + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + c->root_hash_sig = mfree(c->root_hash_sig); + c->root_hash_sig_size = 0; + + unit_write_settingf(u, flags, name, "RootHashSignature="); + } else { + _cleanup_free_ void *p = NULL; + ssize_t len; + + len = base64mem(roothash_sig_decoded, roothash_sig_decoded_size, &encoded); + if (len < 0) + return -ENOMEM; + + p = memdup(roothash_sig_decoded, roothash_sig_decoded_size); + if (!p) + return -ENOMEM; + + free_and_replace(c->root_hash_sig, p); + c->root_hash_sig_size = roothash_sig_decoded_size; + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + + unit_write_settingf(u, flags, name, "RootHashSignature=base64:%s", encoded); + } + } + + return 1; + } + + if (streq(name, "RootHashSignaturePath")) { + c->root_hash_sig_size = 0; + c->root_hash_sig = mfree(c->root_hash_sig); + + return bus_set_transient_path(u, "RootHashSignature", &c->root_hash_sig_path, message, flags, error); + } + + if (streq(name, "RootVerity")) + return bus_set_transient_path(u, name, &c->root_verity, message, flags, error); + + if (streq(name, "RootDirectory")) + return bus_set_transient_path(u, name, &c->root_directory, message, flags, error); + + if (streq(name, "RootEphemeral")) + return bus_set_transient_bool(u, name, &c->root_ephemeral, message, flags, error); + + if (streq(name, "SyslogIdentifier")) + return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error); + + if (streq(name, "LogLevelMax")) + return bus_set_transient_log_level(u, name, &c->log_level_max, message, flags, error); + + if (streq(name, "LogRateLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &c->log_ratelimit_interval_usec, message, flags, error); + + if (streq(name, "LogRateLimitBurst")) + return bus_set_transient_unsigned(u, name, &c->log_ratelimit_burst, message, flags, error); + + if (streq(name, "LogFilterPatterns")) { + /* Use _cleanup_free_, not _cleanup_strv_free_, as we don't want the content of the strv + * to be freed. */ + _cleanup_free_ char **allow_list = NULL, **deny_list = NULL; + const char *pattern; + int is_allowlist; + + r = sd_bus_message_enter_container(message, 'a', "(bs)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(bs)", &is_allowlist, &pattern)) > 0) { + _cleanup_(pattern_freep) pcre2_code *compiled_pattern = NULL; + + if (isempty(pattern)) + continue; + + r = pattern_compile_and_log(pattern, 0, &compiled_pattern); + if (r < 0) + return r; + + r = strv_push(is_allowlist ? &allow_list : &deny_list, (char *)pattern); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(allow_list) && strv_isempty(deny_list)) { + c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns); + c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + r = set_put_strdupv(&c->log_filter_allowed_patterns, allow_list); + if (r < 0) + return r; + r = set_put_strdupv(&c->log_filter_denied_patterns, deny_list); + if (r < 0) + return r; + + STRV_FOREACH(unit_pattern, allow_list) + unit_write_settingf(u, flags, name, "%s=%s", name, *unit_pattern); + STRV_FOREACH(unit_pattern, deny_list) + unit_write_settingf(u, flags, name, "%s=~%s", name, *unit_pattern); + } + } + + return 1; + } + + if (streq(name, "Personality")) + return bus_set_transient_personality(u, name, &c->personality, message, flags, error); + + if (streq(name, "StandardInput")) + return bus_set_transient_std_input(u, name, &c->std_input, message, flags, error); + + if (streq(name, "StandardOutput")) + return bus_set_transient_std_output(u, name, &c->std_output, message, flags, error); + + if (streq(name, "StandardError")) + return bus_set_transient_std_output(u, name, &c->std_error, message, flags, error); + + if (streq(name, "IgnoreSIGPIPE")) + return bus_set_transient_bool(u, name, &c->ignore_sigpipe, message, flags, error); + + if (streq(name, "TTYVHangup")) + return bus_set_transient_bool(u, name, &c->tty_vhangup, message, flags, error); + + if (streq(name, "TTYReset")) + return bus_set_transient_bool(u, name, &c->tty_reset, message, flags, error); + + if (streq(name, "TTYVTDisallocate")) + return bus_set_transient_bool(u, name, &c->tty_vt_disallocate, message, flags, error); + + if (streq(name, "TTYRows")) + return bus_set_transient_unsigned(u, name, &c->tty_rows, message, flags, error); + + if (streq(name, "TTYColumns")) + return bus_set_transient_unsigned(u, name, &c->tty_cols, message, flags, error); + + if (streq(name, "PrivateTmp")) + return bus_set_transient_bool(u, name, &c->private_tmp, message, flags, error); + + if (streq(name, "PrivateDevices")) + return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error); + + if (streq(name, "PrivateMounts")) + return bus_set_transient_tristate(u, name, &c->private_mounts, message, flags, error); + + if (streq(name, "PrivateNetwork")) + return bus_set_transient_bool(u, name, &c->private_network, message, flags, error); + + if (streq(name, "PrivateIPC")) + return bus_set_transient_bool(u, name, &c->private_ipc, message, flags, error); + + if (streq(name, "PrivateUsers")) + return bus_set_transient_bool(u, name, &c->private_users, message, flags, error); + + if (streq(name, "NoNewPrivileges")) + return bus_set_transient_bool(u, name, &c->no_new_privileges, message, flags, error); + + if (streq(name, "SyslogLevelPrefix")) + return bus_set_transient_bool(u, name, &c->syslog_level_prefix, message, flags, error); + + if (streq(name, "MemoryDenyWriteExecute")) + return bus_set_transient_bool(u, name, &c->memory_deny_write_execute, message, flags, error); + + if (streq(name, "RestrictRealtime")) + return bus_set_transient_bool(u, name, &c->restrict_realtime, message, flags, error); + + if (streq(name, "RestrictSUIDSGID")) + return bus_set_transient_bool(u, name, &c->restrict_suid_sgid, message, flags, error); + + if (streq(name, "DynamicUser")) + return bus_set_transient_bool(u, name, &c->dynamic_user, message, flags, error); + + if (streq(name, "RemoveIPC")) + return bus_set_transient_bool(u, name, &c->remove_ipc, message, flags, error); + + if (streq(name, "ProtectKernelTunables")) + return bus_set_transient_bool(u, name, &c->protect_kernel_tunables, message, flags, error); + + if (streq(name, "ProtectKernelModules")) + return bus_set_transient_bool(u, name, &c->protect_kernel_modules, message, flags, error); + + if (streq(name, "ProtectKernelLogs")) + return bus_set_transient_bool(u, name, &c->protect_kernel_logs, message, flags, error); + + if (streq(name, "ProtectClock")) + return bus_set_transient_bool(u, name, &c->protect_clock, message, flags, error); + + if (streq(name, "ProtectControlGroups")) + return bus_set_transient_bool(u, name, &c->protect_control_groups, message, flags, error); + + if (streq(name, "CPUSchedulingResetOnFork")) + return bus_set_transient_bool(u, name, &c->cpu_sched_reset_on_fork, message, flags, error); + + if (streq(name, "NonBlocking")) + return bus_set_transient_bool(u, name, &c->non_blocking, message, flags, error); + + if (streq(name, "LockPersonality")) + return bus_set_transient_bool(u, name, &c->lock_personality, message, flags, error); + + if (streq(name, "ProtectHostname")) + return bus_set_transient_bool(u, name, &c->protect_hostname, message, flags, error); + + if (streq(name, "MemoryKSM")) + return bus_set_transient_tristate(u, name, &c->memory_ksm, message, flags, error); + + if (streq(name, "UtmpIdentifier")) + return bus_set_transient_string(u, name, &c->utmp_id, message, flags, error); + + if (streq(name, "UtmpMode")) + return bus_set_transient_utmp_mode(u, name, &c->utmp_mode, message, flags, error); + + if (streq(name, "PAMName")) + return bus_set_transient_string(u, name, &c->pam_name, message, flags, error); + + if (streq(name, "TimerSlackNSec")) + return bus_set_transient_nsec(u, name, &c->timer_slack_nsec, message, flags, error); + + if (streq(name, "ProtectSystem")) + return bus_set_transient_protect_system(u, name, &c->protect_system, message, flags, error); + + if (streq(name, "ProtectHome")) + return bus_set_transient_protect_home(u, name, &c->protect_home, message, flags, error); + + if (streq(name, "KeyringMode")) + return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error); + + if (streq(name, "ProtectProc")) + return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error); + + if (streq(name, "ProcSubset")) + return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error); + + if (streq(name, "RuntimeDirectoryPreserve")) + return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error); + + if (streq(name, "UMask")) + return bus_set_transient_mode_t(u, name, &c->umask, message, flags, error); + + if (streq(name, "RuntimeDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_RUNTIME].mode, message, flags, error); + + if (streq(name, "StateDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_STATE].mode, message, flags, error); + + if (streq(name, "CacheDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CACHE].mode, message, flags, error); + + if (streq(name, "LogsDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_LOGS].mode, message, flags, error); + + if (streq(name, "ConfigurationDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CONFIGURATION].mode, message, flags, error); + + if (streq(name, "SELinuxContext")) + return bus_set_transient_string(u, name, &c->selinux_context, message, flags, error); + + if (streq(name, "SecureBits")) + return bus_set_transient_secure_bits(u, name, &c->secure_bits, message, flags, error); + + if (streq(name, "CapabilityBoundingSet")) + return bus_set_transient_capability(u, name, &c->capability_bounding_set, message, flags, error); + + if (streq(name, "AmbientCapabilities")) + return bus_set_transient_capability(u, name, &c->capability_ambient_set, message, flags, error); + + if (streq(name, "RestrictNamespaces")) + return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error); + + if (streq(name, "RestrictFileSystems")) { + int allow_list; + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &allow_list); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + FilesystemParseFlags invert_flag = allow_list ? 0 : FILESYSTEM_PARSE_INVERT; + + if (strv_isempty(l)) { + c->restrict_filesystems_allow_list = false; + c->restrict_filesystems = set_free_free(c->restrict_filesystems); + + unit_write_setting(u, flags, name, "RestrictFileSystems="); + return 1; + } + + if (!c->restrict_filesystems) + c->restrict_filesystems_allow_list = allow_list; + + STRV_FOREACH(s, l) { + r = lsm_bpf_parse_filesystem( + *s, + &c->restrict_filesystems, + FILESYSTEM_PARSE_LOG| + (invert_flag ? FILESYSTEM_PARSE_INVERT : 0)| + (c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0), + u->id, NULL, 0); + if (r < 0) + return r; + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s%s", name, allow_list ? "" : "~", joined); + } + + return 1; + } + + if (streq(name, "MountFlags")) + return bus_set_transient_mount_propagation_flag(u, name, &c->mount_propagation_flag, message, flags, error); + + if (streq(name, "NetworkNamespacePath")) + return bus_set_transient_path(u, name, &c->network_namespace_path, message, flags, error); + + if (streq(name, "IPCNamespacePath")) + return bus_set_transient_path(u, name, &c->ipc_namespace_path, message, flags, error); + + if (streq(name, "SupplementaryGroups")) { + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) + if (!isempty(*p) && !valid_user_group_name(*p, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid supplementary group names"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->supplementary_groups = strv_free(c->supplementary_groups); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&c->supplementary_groups, l, true); + if (r < 0) + return -ENOMEM; + + joined = strv_join(c->supplementary_groups, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "SetCredential", "SetCredentialEncrypted")) { + bool isempty = true; + + r = sd_bus_message_enter_container(message, 'a', "(say)"); + if (r < 0) + return r; + + for (;;) { + const char *id; + const void *p; + size_t sz; + + r = sd_bus_message_enter_container(message, 'r', "say"); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_read(message, "s", &id); + if (r < 0) + return r; + + r = sd_bus_message_read_array(message, 'y', &p, &sz); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!credential_name_valid(id)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id); + + isempty = false; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *a = NULL, *b = NULL; + _cleanup_free_ void *copy = NULL; + ExecSetCredential *old; + + copy = memdup(p, sz); + if (!copy) + return -ENOMEM; + + old = hashmap_get(c->set_credentials, id); + if (old) { + free_and_replace(old->data, copy); + old->size = sz; + old->encrypted = streq(name, "SetCredentialEncrypted"); + } else { + _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL; + + sc = new(ExecSetCredential, 1); + if (!sc) + return -ENOMEM; + + *sc = (ExecSetCredential) { + .id = strdup(id), + .data = TAKE_PTR(copy), + .size = sz, + .encrypted = streq(name, "SetCredentialEncrypted"), + }; + + if (!sc->id) + return -ENOMEM; + + r = hashmap_ensure_put(&c->set_credentials, &exec_set_credential_hash_ops, sc->id, sc); + if (r < 0) + return r; + + TAKE_PTR(sc); + } + + a = specifier_escape(id); + if (!a) + return -ENOMEM; + + b = cescape_length(p, sz); + if (!b) + return -ENOMEM; + + (void) unit_write_settingf(u, flags, name, "%s=%s:%s", name, a, b); + } + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) { + c->set_credentials = hashmap_free(c->set_credentials); + (void) unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if (STR_IN_SET(name, "LoadCredential", "LoadCredentialEncrypted")) { + bool isempty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + for (;;) { + const char *id, *source; + + r = sd_bus_message_read(message, "(ss)", &id, &source); + if (r < 0) + return r; + if (r == 0) + break; + + if (!credential_name_valid(id)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id); + + if (!(path_is_absolute(source) ? path_is_normalized(source) : credential_name_valid(source))) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential source is invalid: %s", source); + + isempty = false; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + bool encrypted = streq(name, "LoadCredentialEncrypted"); + + r = hashmap_put_credential(&c->load_credentials, id, source, encrypted); + if (r < 0) + return r; + + (void) unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s:%s", name, id, source); + } + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) { + c->load_credentials = hashmap_free(c->load_credentials); + (void) unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if (streq(name, "ImportCredential")) { + bool isempty = true; + + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + const char *s; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + if (r == 0) + break; + + if (!credential_glob_valid(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential name or glob is invalid: %s", s); + + isempty = false; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = set_put_strdup(&c->import_credentials, s); + if (r < 0) + return r; + + (void) unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, s); + } + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) { + c->import_credentials = set_free_free(c->import_credentials); + (void) unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if (streq(name, "SyslogLevel")) { + int32_t level; + + r = sd_bus_message_read(message, "i", &level); + if (r < 0) + return r; + + if (!log_level_is_valid(level)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log level value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->syslog_priority = (c->syslog_priority & LOG_FACMASK) | level; + unit_write_settingf(u, flags, name, "SyslogLevel=%i", level); + } + + return 1; + + } else if (streq(name, "SyslogFacility")) { + int32_t facility; + + r = sd_bus_message_read(message, "i", &facility); + if (r < 0) + return r; + + if (!log_facility_unshifted_is_valid(facility)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log facility value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->syslog_priority = (facility << 3) | LOG_PRI(c->syslog_priority); + unit_write_settingf(u, flags, name, "SyslogFacility=%i", facility); + } + + return 1; + + } else if (streq(name, "LogNamespace")) { + const char *n; + + r = sd_bus_message_read(message, "s", &n); + if (r < 0) + return r; + + if (!isempty(n) && !log_namespace_name_valid(n)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log namespace name not valid"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + + if (isempty(n)) { + c->log_namespace = mfree(c->log_namespace); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + r = free_and_strdup(&c->log_namespace, n); + if (r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%s", name, n); + } + } + + return 1; + + } else if (streq(name, "LogExtraFields")) { + size_t n = 0; + + r = sd_bus_message_enter_container(message, 'a', "ay"); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ void *copy = NULL; + struct iovec *t; + const char *eq; + const void *p; + size_t sz; + + /* Note that we expect a byte array for each field, instead of a string. That's because on the + * lower-level journal fields can actually contain binary data and are not restricted to text, + * and we should not "lose precision" in our types on the way. That said, I am pretty sure + * actually encoding binary data as unit metadata is not a good idea. Hence we actually refuse + * any actual binary data, and only accept UTF-8. This allows us to eventually lift this + * limitation, should a good, valid use case arise. */ + + r = sd_bus_message_read_array(message, 'y', &p, &sz); + if (r < 0) + return r; + if (r == 0) + break; + + if (memchr(p, 0, sz)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains zero byte"); + + eq = memchr(p, '=', sz); + if (!eq) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains no '=' character"); + if (!journal_field_valid(p, eq - (const char*) p, false)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field invalid"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec)); + if (!t) + return -ENOMEM; + c->log_extra_fields = t; + } + + copy = malloc(sz + 1); + if (!copy) + return -ENOMEM; + + memcpy(copy, p, sz); + ((uint8_t*) copy)[sz] = 0; + + if (!utf8_is_valid(copy)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field is not valid UTF-8"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE(copy, sz); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C, name, "LogExtraFields=%s", (char*) copy); + + copy = NULL; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && n == 0) { + exec_context_free_log_extra_fields(c); + unit_write_setting(u, flags, name, "LogExtraFields="); + } + + return 1; + } + +#if HAVE_SECCOMP + + if (streq(name, "SystemCallErrorNumber")) + return bus_set_transient_errno(u, name, &c->syscall_errno, message, flags, error); + + if (streq(name, "SystemCallFilter")) { + int allow_list; + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &allow_list); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + SeccompParseFlags invert_flag = allow_list ? 0 : SECCOMP_PARSE_INVERT; + + if (strv_isempty(l)) { + c->syscall_allow_list = false; + c->syscall_filter = hashmap_free(c->syscall_filter); + + unit_write_settingf(u, flags, name, "SystemCallFilter="); + return 1; + } + + if (!c->syscall_filter) { + c->syscall_filter = hashmap_new(NULL); + if (!c->syscall_filter) + return log_oom(); + + c->syscall_allow_list = allow_list; + + if (c->syscall_allow_list) { + r = seccomp_parse_syscall_filter("@default", + -1, + c->syscall_filter, + SECCOMP_PARSE_PERMISSIVE | + SECCOMP_PARSE_ALLOW_LIST, + u->id, + NULL, 0); + if (r < 0) + return r; + } + } + + STRV_FOREACH(s, l) { + _cleanup_free_ char *n = NULL; + int e; + + r = parse_syscall_and_errno(*s, &n, &e); + if (r < 0) + return r; + + if (allow_list && e >= 0) + return -EINVAL; + + r = seccomp_parse_syscall_filter(n, + e, + c->syscall_filter, + SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE | + invert_flag | + (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0), + u->id, + NULL, 0); + if (r < 0) + return r; + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "SystemCallFilter=%s%s", allow_list ? "" : "~", joined); + } + + return 1; + + } else if (streq(name, "SystemCallLog")) { + int allow_list; + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &allow_list); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + SeccompParseFlags invert_flag = allow_list ? 0 : SECCOMP_PARSE_INVERT; + + if (strv_isempty(l)) { + c->syscall_log_allow_list = false; + c->syscall_log = hashmap_free(c->syscall_log); + + unit_write_settingf(u, flags, name, "SystemCallLog="); + return 1; + } + + if (!c->syscall_log) { + c->syscall_log = hashmap_new(NULL); + if (!c->syscall_log) + return log_oom(); + + c->syscall_log_allow_list = allow_list; + } + + STRV_FOREACH(s, l) { + r = seccomp_parse_syscall_filter(*s, + -1, /* errno not used */ + c->syscall_log, + SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE | + invert_flag | + (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0), + u->id, + NULL, 0); + if (r < 0) + return r; + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "SystemCallLog=%s%s", allow_list ? "" : "~", joined); + } + + return 1; + + } else if (streq(name, "SystemCallArchitectures")) { + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + + if (strv_isempty(l)) + c->syscall_archs = set_free(c->syscall_archs); + else + STRV_FOREACH(s, l) { + uint32_t a; + + r = seccomp_arch_from_string(*s, &a); + if (r < 0) + return r; + + r = set_ensure_put(&c->syscall_archs, NULL, UINT32_TO_PTR(a + 1)); + if (r < 0) + return r; + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s", name, joined); + } + + return 1; + + } else if (streq(name, "RestrictAddressFamilies")) { + _cleanup_strv_free_ char **l = NULL; + int allow_list; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &allow_list); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + + if (strv_isempty(l)) { + c->address_families_allow_list = allow_list; + c->address_families = set_free(c->address_families); + + unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s", + allow_list ? "none" : ""); + return 1; + } + + if (!c->address_families) { + c->address_families = set_new(NULL); + if (!c->address_families) + return log_oom(); + + c->address_families_allow_list = allow_list; + } + + STRV_FOREACH(s, l) { + int af; + + af = af_from_name(*s); + if (af < 0) + return af; + + if (allow_list == c->address_families_allow_list) { + r = set_put(c->address_families, INT_TO_PTR(af)); + if (r < 0) + return r; + } else + set_remove(c->address_families, INT_TO_PTR(af)); + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s%s", allow_list ? "" : "~", joined); + } + + return 1; + } +#endif + if (STR_IN_SET(name, "CPUAffinity", "NUMAMask")) { + const void *a; + size_t n; + bool affinity = streq(name, "CPUAffinity"); + _cleanup_(cpu_set_reset) CPUSet set = {}; + + r = sd_bus_message_read_array(message, 'y', &a, &n); + if (r < 0) + return r; + + r = cpu_set_from_dbus(a, n, &set); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (n == 0) { + cpu_set_reset(affinity ? &c->cpu_set : &c->numa_policy.nodes); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *str = NULL; + + str = cpu_set_to_string(&set); + if (!str) + return -ENOMEM; + + /* We forego any optimizations here, and always create the structure using + * cpu_set_add_all(), because we don't want to care if the existing size we + * got over dbus is appropriate. */ + r = cpu_set_add_all(affinity ? &c->cpu_set : &c->numa_policy.nodes, &set); + if (r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%s", name, str); + } + } + + return 1; + + } else if (streq(name, "CPUAffinityFromNUMA")) { + int q; + + r = sd_bus_message_read_basic(message, 'b', &q); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_affinity_from_numa = q; + unit_write_settingf(u, flags, name, "%s=%s", "CPUAffinity", "numa"); + } + + return 1; + + } else if (streq(name, "NUMAPolicy")) { + int32_t type; + + r = sd_bus_message_read(message, "i", &type); + if (r < 0) + return r; + + if (!mpol_is_valid(type)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NUMAPolicy value: %i", type); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) + c->numa_policy.type = type; + + return 1; + + } else if (streq(name, "Nice")) { + int32_t q; + + r = sd_bus_message_read(message, "i", &q); + if (r < 0) + return r; + + if (!nice_is_valid(q)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid Nice value: %i", q); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->nice = q; + c->nice_set = true; + + unit_write_settingf(u, flags, name, "Nice=%i", q); + } + + return 1; + + } else if (streq(name, "CPUSchedulingPolicy")) { + int32_t q; + + r = sd_bus_message_read(message, "i", &q); + if (r < 0) + return r; + + if (!sched_policy_is_valid(q)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling policy: %i", q); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *s = NULL; + + r = sched_policy_to_string_alloc(q, &s); + if (r < 0) + return r; + + c->cpu_sched_policy = q; + c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(q), sched_get_priority_max(q)); + c->cpu_sched_set = true; + + unit_write_settingf(u, flags, name, "CPUSchedulingPolicy=%s", s); + } + + return 1; + + } else if (streq(name, "CPUSchedulingPriority")) { + int32_t p; + + r = sd_bus_message_read(message, "i", &p); + if (r < 0) + return r; + + /* On Linux RR/FIFO range from 1 to 99 and OTHER/BATCH may only be 0. Policy might be set + * later so we do not check the precise range, but only the generic outer bounds. */ + if (p < 0 || p > 99) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling priority: %i", p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_sched_priority = p; + c->cpu_sched_set = true; + + unit_write_settingf(u, flags, name, "CPUSchedulingPriority=%i", p); + } + + return 1; + + } else if (streq(name, "IOSchedulingClass")) { + int32_t q; + + r = sd_bus_message_read(message, "i", &q); + if (r < 0) + return r; + + if (!ioprio_class_is_valid(q)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling class: %i", q); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *s = NULL; + + r = ioprio_class_to_string_alloc(q, &s); + if (r < 0) + return r; + + c->ioprio = ioprio_normalize(ioprio_prio_value(q, ioprio_prio_data(c->ioprio))); + c->ioprio_set = true; + + unit_write_settingf(u, flags, name, "IOSchedulingClass=%s", s); + } + + return 1; + + } else if (streq(name, "IOSchedulingPriority")) { + int32_t p; + + r = sd_bus_message_read(message, "i", &p); + if (r < 0) + return r; + + if (!ioprio_priority_is_valid(p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling priority: %i", p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->ioprio = ioprio_normalize(ioprio_prio_value(ioprio_prio_class(c->ioprio), p)); + c->ioprio_set = true; + + unit_write_settingf(u, flags, name, "IOSchedulingPriority=%i", p); + } + + return 1; + + } else if (streq(name, "MountAPIVFS")) { + bool b; + + r = bus_set_transient_bool(u, name, &b, message, flags, error); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->mount_apivfs = b; + c->mount_apivfs_set = true; + } + + return 1; + + } else if (streq(name, "WorkingDirectory")) { + const char *s; + bool missing_ok; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (s[0] == '-') { + missing_ok = true; + s++; + } else + missing_ok = false; + + if (!isempty(s) && !streq(s, "~") && !path_is_absolute(s)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "WorkingDirectory= expects an absolute path or '~'"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (streq(s, "~")) { + c->working_directory = mfree(c->working_directory); + c->working_directory_home = true; + } else { + r = free_and_strdup(&c->working_directory, empty_to_null(s)); + if (r < 0) + return r; + + c->working_directory_home = false; + } + + c->working_directory_missing_ok = missing_ok; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s); + } + + return 1; + + } else if (STR_IN_SET(name, + "StandardInputFileDescriptorName", "StandardOutputFileDescriptorName", "StandardErrorFileDescriptorName")) { + const char *s; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!isempty(s) && !fdname_is_valid(s)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid file descriptor name"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + + if (streq(name, "StandardInputFileDescriptorName")) { + r = free_and_strdup(c->stdio_fdname + STDIN_FILENO, empty_to_null(s)); + if (r < 0) + return r; + + c->std_input = EXEC_INPUT_NAMED_FD; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=fd:%s", exec_context_fdname(c, STDIN_FILENO)); + + } else if (streq(name, "StandardOutputFileDescriptorName")) { + r = free_and_strdup(c->stdio_fdname + STDOUT_FILENO, empty_to_null(s)); + if (r < 0) + return r; + + c->std_output = EXEC_OUTPUT_NAMED_FD; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=fd:%s", exec_context_fdname(c, STDOUT_FILENO)); + + } else { + assert(streq(name, "StandardErrorFileDescriptorName")); + + r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + c->std_error = EXEC_OUTPUT_NAMED_FD; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=fd:%s", exec_context_fdname(c, STDERR_FILENO)); + } + } + + return 1; + + } else if (STR_IN_SET(name, + "StandardInputFile", + "StandardOutputFile", "StandardOutputFileToAppend", "StandardOutputFileToTruncate", + "StandardErrorFile", "StandardErrorFileToAppend", "StandardErrorFileToTruncate")) { + const char *s; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!isempty(s)) { + if (!path_is_absolute(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute", s); + if (!path_is_normalized(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not normalized", s); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + + if (streq(name, "StandardInputFile")) { + r = free_and_strdup(&c->stdio_file[STDIN_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + c->std_input = EXEC_INPUT_FILE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=file:%s", s); + + } else if (STR_IN_SET(name, "StandardOutputFile", "StandardOutputFileToAppend", "StandardOutputFileToTruncate")) { + r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + if (streq(name, "StandardOutputFile")) { + c->std_output = EXEC_OUTPUT_FILE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=file:%s", s); + } else if (streq(name, "StandardOutputFileToAppend")) { + c->std_output = EXEC_OUTPUT_FILE_APPEND; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=append:%s", s); + } else { + assert(streq(name, "StandardOutputFileToTruncate")); + c->std_output = EXEC_OUTPUT_FILE_TRUNCATE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=truncate:%s", s); + } + } else { + assert(STR_IN_SET(name, "StandardErrorFile", "StandardErrorFileToAppend", "StandardErrorFileToTruncate")); + + r = free_and_strdup(&c->stdio_file[STDERR_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + if (streq(name, "StandardErrorFile")) { + c->std_error = EXEC_OUTPUT_FILE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=file:%s", s); + } else if (streq(name, "StandardErrorFileToAppend")) { + c->std_error = EXEC_OUTPUT_FILE_APPEND; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=append:%s", s); + } else { + assert(streq(name, "StandardErrorFileToTruncate")); + c->std_error = EXEC_OUTPUT_FILE_TRUNCATE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=truncate:%s", s); + } + } + } + + return 1; + + } else if (streq(name, "StandardInputData")) { + const void *p; + size_t sz; + + r = sd_bus_message_read_array(message, 'y', &p, &sz); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *encoded = NULL; + + if (sz == 0) { + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + + unit_write_settingf(u, flags, name, "StandardInputData="); + } else { + void *q; + ssize_t n; + + if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */ + c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX) + return -E2BIG; + + n = base64mem(p, sz, &encoded); + if (n < 0) + return (int) n; + + q = realloc(c->stdin_data, c->stdin_data_size + sz); + if (!q) + return -ENOMEM; + + memcpy((uint8_t*) q + c->stdin_data_size, p, sz); + + c->stdin_data = q; + c->stdin_data_size += sz; + + unit_write_settingf(u, flags, name, "StandardInputData=%s", encoded); + } + } + + return 1; + + } else if (streq(name, "Environment")) { + + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!strv_env_is_valid(l)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment block."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->environment = strv_free(c->environment); + unit_write_setting(u, flags, name, "Environment="); + } else { + _cleanup_free_ char *joined = NULL; + char **e; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C); + if (!joined) + return -ENOMEM; + + e = strv_env_merge(c->environment, l); + if (!e) + return -ENOMEM; + + strv_free_and_replace(c->environment, e); + unit_write_settingf(u, flags, name, "Environment=%s", joined); + } + } + + return 1; + + } else if (streq(name, "UnsetEnvironment")) { + + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!strv_env_name_or_assignment_is_valid(l)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid UnsetEnvironment= list."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->unset_environment = strv_free(c->unset_environment); + unit_write_setting(u, flags, name, "UnsetEnvironment="); + } else { + _cleanup_free_ char *joined = NULL; + char **e; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C); + if (!joined) + return -ENOMEM; + + e = strv_env_merge(c->unset_environment, l); + if (!e) + return -ENOMEM; + + strv_free_and_replace(c->unset_environment, e); + unit_write_settingf(u, flags, name, "UnsetEnvironment=%s", joined); + } + } + + return 1; + + } else if (streq(name, "OOMScoreAdjust")) { + int oa; + + r = sd_bus_message_read(message, "i", &oa); + if (r < 0) + return r; + + if (!oom_score_adjust_is_valid(oa)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "OOM score adjust value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->oom_score_adjust = oa; + c->oom_score_adjust_set = true; + unit_write_settingf(u, flags, name, "OOMScoreAdjust=%i", oa); + } + + return 1; + + } else if (streq(name, "CoredumpFilter")) { + uint64_t f; + + r = sd_bus_message_read(message, "t", &f); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->coredump_filter = f; + c->coredump_filter_set = true; + unit_write_settingf(u, flags, name, "CoredumpFilter=0x%"PRIx64, f); + } + + return 1; + + } else if (streq(name, "EnvironmentFiles")) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *joined = NULL; + _cleanup_strv_free_ char **l = NULL; + FILE *f; + + r = sd_bus_message_enter_container(message, 'a', "(sb)"); + if (r < 0) + return r; + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fputs("EnvironmentFile=\n", f); + + STRV_FOREACH(i, c->environment_files) { + _cleanup_free_ char *q = NULL; + + q = specifier_escape(*i); + if (!q) + return -ENOMEM; + + fprintf(f, "EnvironmentFile=%s\n", q); + } + + while ((r = sd_bus_message_enter_container(message, 'r', "sb")) > 0) { + const char *path; + int b; + + r = sd_bus_message_read(message, "sb", &path, &b); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *q = NULL, *buf = NULL; + + buf = strjoin(b ? "-" : "", path); + if (!buf) + return -ENOMEM; + + q = specifier_escape(buf); + if (!q) + return -ENOMEM; + + fprintf(f, "EnvironmentFile=%s\n", q); + + r = strv_consume(&l, TAKE_PTR(buf)); + if (r < 0) + return r; + } + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = memstream_finalize(&m, &joined, NULL); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->environment_files = strv_free(c->environment_files); + unit_write_setting(u, flags, name, "EnvironmentFile="); + } else { + r = strv_extend_strv(&c->environment_files, l, true); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, joined); + } + } + + return 1; + + } else if (streq(name, "PassEnvironment")) { + + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!strv_env_name_is_valid(l)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PassEnvironment= block."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->pass_environment = strv_free(c->pass_environment); + unit_write_setting(u, flags, name, "PassEnvironment="); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&c->pass_environment, l, true); + if (r < 0) + return r; + + /* We write just the new settings out to file, with unresolved specifiers. */ + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "PassEnvironment=%s", joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "ReadWriteDirectories", "ReadOnlyDirectories", "InaccessibleDirectories", + "ReadWritePaths", "ReadOnlyPaths", "InaccessiblePaths", "ExecPaths", "NoExecPaths", + "ExtensionDirectories")) { + _cleanup_strv_free_ char **l = NULL; + char ***dirs; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + char *i = *p; + size_t offset; + + offset = i[0] == '-'; + offset += i[offset] == '+'; + if (!path_is_absolute(i + offset)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name); + + path_simplify(i + offset); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (STR_IN_SET(name, "ReadWriteDirectories", "ReadWritePaths")) + dirs = &c->read_write_paths; + else if (STR_IN_SET(name, "ReadOnlyDirectories", "ReadOnlyPaths")) + dirs = &c->read_only_paths; + else if (streq(name, "ExecPaths")) + dirs = &c->exec_paths; + else if (streq(name, "NoExecPaths")) + dirs = &c->no_exec_paths; + else if (streq(name, "ExtensionDirectories")) + dirs = &c->extension_directories; + else /* "InaccessiblePaths" */ + dirs = &c->inaccessible_paths; + + if (strv_isempty(l)) { + *dirs = strv_free(*dirs); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS); + if (!joined) + return -ENOMEM; + + r = strv_extend_strv(dirs, l, true); + if (r < 0) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (streq(name, "ExecSearchPath")) { + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) + if (!path_is_absolute(*p) || !path_is_normalized(*p) || strchr(*p, ':')) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->exec_search_path = strv_free(c->exec_search_path); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ExecSearchPath="); + } else { + _cleanup_free_ char *joined = NULL; + r = strv_extend_strv(&c->exec_search_path, l, true); + if (r < 0) + return -ENOMEM; + joined = strv_join(c->exec_search_path, ":"); + if (!joined) + return log_oom(); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ExecSearchPath=%s", joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "RuntimeDirectory", "StateDirectory", "CacheDirectory", "LogsDirectory", "ConfigurationDirectory")) { + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + if (!path_is_normalized(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is not normalized: %s", name, *p); + + if (path_is_absolute(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is absolute: %s", name, *p); + + if (path_startswith(*p, "private")) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path can't be 'private': %s", name, *p); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + ExecDirectoryType i; + ExecDirectory *d; + + assert_se((i = exec_directory_type_from_string(name)) >= 0); + d = c->directories + i; + + if (strv_isempty(l)) { + exec_directory_done(d); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + STRV_FOREACH(source, l) { + r = exec_directory_add(d, *source, NULL); + if (r < 0) + return log_oom(); + } + exec_directory_sort(d); + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "AppArmorProfile", "SmackProcessLabel")) { + int ignore; + const char *s; + + r = sd_bus_message_read(message, "(bs)", &ignore, &s); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + char **p; + bool *b; + + if (streq(name, "AppArmorProfile")) { + p = &c->apparmor_profile; + b = &c->apparmor_profile_ignore; + } else { /* "SmackProcessLabel" */ + p = &c->smack_process_label; + b = &c->smack_process_label_ignore; + } + + if (isempty(s)) { + *p = mfree(*p); + *b = false; + } else { + if (free_and_strdup(p, s) < 0) + return -ENOMEM; + *b = ignore; + } + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s%s", name, ignore ? "-" : "", strempty(s)); + } + + return 1; + + } else if (STR_IN_SET(name, "BindPaths", "BindReadOnlyPaths")) { + char *source, *destination; + int ignore_enoent; + uint64_t mount_flags; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ssbt)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ssbt)", &source, &destination, &ignore_enoent, &mount_flags)) > 0) { + + if (!path_is_absolute(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source); + if (!path_is_absolute(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination); + if (!IN_SET(mount_flags, 0, MS_REC)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown mount flags."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts, + &(BindMount) { + .source = source, + .destination = destination, + .read_only = !!strstr(name, "ReadOnly"), + .recursive = !!(mount_flags & MS_REC), + .ignore_enoent = ignore_enoent, + }); + if (r < 0) + return r; + + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s%s:%s:%s", + name, + ignore_enoent ? "-" : "", + source, + destination, + (mount_flags & MS_REC) ? "rbind" : "norbind"); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (empty) { + bind_mount_free_many(c->bind_mounts, c->n_bind_mounts); + c->bind_mounts = NULL; + c->n_bind_mounts = 0; + + unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if (streq(name, "TemporaryFileSystem")) { + const char *path, *options; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &path, &options)) > 0) { + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Mount point %s is not absolute.", path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, path, options); + if (r < 0) + return r; + + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s:%s", + name, + path, + options); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (empty) { + temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems); + c->temporary_filesystems = NULL; + c->n_temporary_filesystems = 0; + + unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if ((suffix = startswith(name, "Limit"))) { + const char *soft = NULL; + int ri; + + ri = rlimit_from_string(suffix); + if (ri < 0) { + soft = endswith(suffix, "Soft"); + if (soft) { + const char *n; + + n = strndupa_safe(suffix, soft - suffix); + ri = rlimit_from_string(n); + if (ri >= 0) + name = strjoina("Limit", n); + } + } + + if (ri >= 0) { + uint64_t rl; + rlim_t x; + + r = sd_bus_message_read(message, "t", &rl); + if (r < 0) + return r; + + if (rl == UINT64_MAX) + x = RLIM_INFINITY; + else { + x = (rlim_t) rl; + + if ((uint64_t) x != rl) + return -ERANGE; + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *f = NULL; + struct rlimit nl; + + if (c->rlimit[ri]) { + nl = *c->rlimit[ri]; + + if (soft) + nl.rlim_cur = x; + else + nl.rlim_max = x; + } else + /* When the resource limit is not initialized yet, then assign the value to both fields */ + nl = (struct rlimit) { + .rlim_cur = x, + .rlim_max = x, + }; + + r = rlimit_format(&nl, &f); + if (r < 0) + return r; + + if (c->rlimit[ri]) + *c->rlimit[ri] = nl; + else { + c->rlimit[ri] = newdup(struct rlimit, &nl, 1); + if (!c->rlimit[ri]) + return -ENOMEM; + } + + unit_write_settingf(u, flags, name, "%s=%s", name, f); + } + + return 1; + } + + } else if (streq(name, "MountImages")) { + _cleanup_free_ char *format_str = NULL; + MountImage *mount_images = NULL; + size_t n_mount_images = 0; + char *source, *destination; + int permissive; + + r = sd_bus_message_enter_container(message, 'a', "(ssba(ss))"); + if (r < 0) + return r; + + for (;;) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *source_escaped = NULL, *destination_escaped = NULL; + char *tuple; + + r = sd_bus_message_enter_container(message, 'r', "ssba(ss)"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ssb", &source, &destination, &permissive); + if (r <= 0) + break; + + if (!path_is_absolute(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source); + if (!path_is_normalized(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source); + if (!path_is_absolute(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination); + if (!path_is_normalized(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not normalized.", destination); + + /* Need to store them in the unit with the escapes, so that they can be parsed again */ + source_escaped = shell_escape(source, ":"); + if (!source_escaped) + return -ENOMEM; + destination_escaped = shell_escape(destination, ":"); + if (!destination_escaped) + return -ENOMEM; + + tuple = strjoin(format_str, + format_str ? " " : "", + permissive ? "-" : "", + source_escaped, + ":", + destination_escaped); + if (!tuple) + return -ENOMEM; + free_and_replace(format_str, tuple); + + r = bus_read_mount_options(message, error, &options, &format_str, ":"); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = mount_image_add(&mount_images, &n_mount_images, + &(MountImage) { + .source = source, + .destination = destination, + .mount_options = options, + .ignore_enoent = permissive, + .type = MOUNT_IMAGE_DISCRETE, + }); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (n_mount_images == 0) { + c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images); + + unit_write_settingf(u, flags, name, "%s=", name); + } else { + for (size_t i = 0; i < n_mount_images; ++i) { + r = mount_image_add(&c->mount_images, &c->n_mount_images, &mount_images[i]); + if (r < 0) + return r; + } + + unit_write_settingf(u, flags|UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS, + name, + "%s=%s", + name, + format_str); + } + } + + mount_images = mount_image_free_many(mount_images, &n_mount_images); + + return 1; + } else if (streq(name, "ExtensionImages")) { + _cleanup_free_ char *format_str = NULL; + MountImage *extension_images = NULL; + size_t n_extension_images = 0; + + r = sd_bus_message_enter_container(message, 'a', "(sba(ss))"); + if (r < 0) + return r; + + for (;;) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *source_escaped = NULL; + char *source, *tuple; + int permissive; + + r = sd_bus_message_enter_container(message, 'r', "sba(ss)"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "sb", &source, &permissive); + if (r <= 0) + break; + + if (!path_is_absolute(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source); + if (!path_is_normalized(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source); + + /* Need to store them in the unit with the escapes, so that they can be parsed again */ + source_escaped = shell_escape(source, ":"); + if (!source_escaped) + return -ENOMEM; + + tuple = strjoin(format_str, + format_str ? " " : "", + permissive ? "-" : "", + source_escaped); + if (!tuple) + return -ENOMEM; + free_and_replace(format_str, tuple); + + r = bus_read_mount_options(message, error, &options, &format_str, ":"); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = mount_image_add(&extension_images, &n_extension_images, + &(MountImage) { + .source = source, + .mount_options = options, + .ignore_enoent = permissive, + .type = MOUNT_IMAGE_EXTENSION, + }); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (n_extension_images == 0) { + c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images); + + unit_write_settingf(u, flags, name, "%s=", name); + } else { + for (size_t i = 0; i < n_extension_images; ++i) { + r = mount_image_add(&c->extension_images, &c->n_extension_images, &extension_images[i]); + if (r < 0) + return r; + } + + unit_write_settingf(u, flags|UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS, + name, + "%s=%s", + name, + format_str); + } + } + + extension_images = mount_image_free_many(extension_images, &n_extension_images); + + return 1; + + } else if (STR_IN_SET(name, "StateDirectorySymlink", "RuntimeDirectorySymlink", "CacheDirectorySymlink", "LogsDirectorySymlink")) { + char *source, *destination; + ExecDirectory *directory; + uint64_t symlink_flags; /* No flags for now, reserved for future uses. */ + ExecDirectoryType i; + + assert_se((i = exec_directory_type_symlink_from_string(name)) >= 0); + directory = c->directories + i; + + r = sd_bus_message_enter_container(message, 'a', "(sst)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(sst)", &source, &destination, &symlink_flags)) > 0) { + if (!path_is_valid(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not valid.", source); + if (path_is_absolute(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is absolute.", source); + if (!path_is_normalized(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source); + if (!path_is_valid(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not valid.", destination); + if (path_is_absolute(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is absolute.", destination); + if (!path_is_normalized(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not normalized.", destination); + if (symlink_flags != 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Flags must be zero."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *destination_escaped = NULL, *source_escaped = NULL; + + r = exec_directory_add(directory, source, destination); + if (r < 0) + return r; + + /* Need to store them in the unit with the escapes, so that they can be parsed again */ + source_escaped = xescape(source, ":"); + destination_escaped = xescape(destination, ":"); + if (!source_escaped || !destination_escaped) + return -ENOMEM; + + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, exec_directory_type_to_string(i), + "%s=%s:%s", + exec_directory_type_to_string(i), + source_escaped, + destination_escaped); + } + } + if (r < 0) + return r; + + exec_directory_sort(directory); + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return 1; + + } else if (STR_IN_SET(name, "RootImagePolicy", "MountImagePolicy", "ExtensionImagePolicy")) { + _cleanup_(image_policy_freep) ImagePolicy *p = NULL; + const char *s; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + r = image_policy_from_string(s, &p); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to parse image policy string: %s", s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *t = NULL; + ImagePolicy **pp = + streq(name, "RootImagePolicy") ? &c->root_image_policy : + streq(name, "MountImagePolicy") ? &c->mount_image_policy : + &c->extension_image_policy; + + r = image_policy_to_string(p, /* simplify= */ true, &t); + if (r < 0) + return r; + + image_policy_free(*pp); + *pp = TAKE_PTR(p); + + unit_write_settingf( + u, flags, name, + "%s=%s", + name, + t); /* no escaping necessary */ + } + + return 1; + } + + return 0; +} diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h new file mode 100644 index 0000000..5926bdb --- /dev/null +++ b/src/core/dbus-execute.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "execute.h" + +#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \ + BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \ + BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \ + SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \ + SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \ + SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags) + +#define BUS_EXEC_COMMAND_VTABLE(name, offset, flags) \ + SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command, offset, flags) + +#define BUS_EXEC_COMMAND_LIST_VTABLE(name, offset, flags) \ + SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command_list, offset, flags) + +#define BUS_EXEC_EX_COMMAND_LIST_VTABLE(name, offset, flags) \ + SD_BUS_PROPERTY(name, "a(sasasttttuii)", bus_property_get_exec_ex_command_list, offset, flags) + +extern const sd_bus_vtable bus_exec_vtable[]; + +int bus_property_get_exec_output(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_exec_command(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_exec_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_exec_ex_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_exec_preserve_mode(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); + +int bus_exec_context_set_transient_property(Unit *u, ExecContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_exec_command(Unit *u, const char *name, ExecCommand **exec_command, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_exec_preserve_mode(Unit *u, const char *name, ExecPreserveMode *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-job.c b/src/core/dbus-job.c new file mode 100644 index 0000000..c88d8c2 --- /dev/null +++ b/src/core/dbus-job.c @@ -0,0 +1,374 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "bus-util.h" +#include "dbus-job.h" +#include "dbus-unit.h" +#include "dbus-util.h" +#include "dbus.h" +#include "job.h" +#include "log.h" +#include "selinux-access.h" +#include "string-util.h" +#include "strv.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, job_type, JobType); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_state, job_state, JobState); + +static int property_get_unit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + Job *j = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + p = unit_dbus_path(j->unit); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(so)", j->unit->id, p); +} + +int bus_job_method_cancel(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Job *j = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_unit_access_check(j->unit, message, "stop", error); + if (r < 0) + return r; + + /* Access is granted to the job owner */ + if (!sd_bus_track_contains(j->bus_track, sd_bus_message_get_sender(message))) { + + /* And for everybody else consult polkit */ + r = bus_verify_manage_units_async(j->unit->manager, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + } + + job_finish_and_invalidate(j, JOB_CANCELED, true, false); + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ Job **list = NULL; + Job *j = userdata; + int r, n; + + if (strstr(sd_bus_message_get_member(message), "After")) + n = job_get_after(j, &list); + else + n = job_get_before(j, &list); + if (n < 0) + return n; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(usssoo)"); + if (r < 0) + return r; + + for (int i = 0; i < n; i ++) { + _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + + job_path = job_dbus_path(list[i]); + if (!job_path) + return -ENOMEM; + + unit_path = unit_dbus_path(list[i]->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(usssoo)", + list[i]->id, + list[i]->unit->id, + job_type_to_string(list[i]->type), + job_state_to_string(list[i]->state), + job_path, + unit_path); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +const sd_bus_vtable bus_job_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_METHOD("Cancel", NULL, NULL, bus_job_method_cancel, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetAfter", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(usssoo)", jobs), + bus_job_method_get_waiting_jobs, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetBefore", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(usssoo)", jobs), + bus_job_method_get_waiting_jobs, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_PROPERTY("Id", "u", NULL, offsetof(Job, id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Unit", "(so)", property_get_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobType", "s", property_get_type, offsetof(Job, type), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("State", "s", property_get_state, offsetof(Job, state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("ActivationDetails", "a(ss)", bus_property_get_activation_details, offsetof(Job, activation_details), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static int bus_job_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Job *j; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = manager_get_job_from_dbus_path(m, path, &j); + if (r < 0) + return 0; + + *found = j; + return 1; +} + +static int bus_job_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + unsigned k = 0; + Job *j; + + l = new0(char*, hashmap_size(m->jobs)+1); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(j, m->jobs) { + l[k] = job_dbus_path(j); + if (!l[k]) + return -ENOMEM; + + k++; + } + + assert(hashmap_size(m->jobs) == k); + + *nodes = TAKE_PTR(l); + + return k; +} + +const BusObjectImplementation job_object = { + "/org/freedesktop/systemd1/job", + "org.freedesktop.systemd1.Job", + .fallback_vtables = BUS_FALLBACK_VTABLES({bus_job_vtable, bus_job_find}), + .node_enumerator = bus_job_enumerate, +}; + +static int send_new_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Job *j = ASSERT_PTR(userdata); + int r; + + assert(bus); + + p = job_dbus_path(j); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "JobNew"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "uos", j->id, p, j->unit->id); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +static int send_changed_signal(sd_bus *bus, void *userdata) { + _cleanup_free_ char *p = NULL; + Job *j = ASSERT_PTR(userdata); + + assert(bus); + + p = job_dbus_path(j); + if (!p) + return -ENOMEM; + + return sd_bus_emit_properties_changed(bus, p, "org.freedesktop.systemd1.Job", "State", NULL); +} + +void bus_job_send_change_signal(Job *j) { + int r; + + assert(j); + + /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */ + bus_unit_send_pending_change_signal(j->unit, true); + + if (j->in_dbus_queue) { + LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = false; + + /* The job might be good to be GC once its pending signals have been sent */ + job_add_to_gc_queue(j); + } + + r = bus_foreach_bus(j->manager, j->bus_track, j->sent_dbus_new_signal ? send_changed_signal : send_new_signal, j); + if (r < 0) + log_debug_errno(r, "Failed to send job change signal for %u: %m", j->id); + + j->sent_dbus_new_signal = true; +} + +void bus_job_send_pending_change_signal(Job *j, bool including_new) { + assert(j); + + if (!j->in_dbus_queue) + return; + + if (!j->sent_dbus_new_signal && !including_new) + return; + + if (MANAGER_IS_RELOADING(j->unit->manager)) + return; + + bus_job_send_change_signal(j); +} + +static int send_removed_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Job *j = ASSERT_PTR(userdata); + int r; + + assert(bus); + + p = job_dbus_path(j); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "JobRemoved"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "uoss", j->id, p, j->unit->id, job_result_to_string(j->result)); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +void bus_job_send_removed_signal(Job *j) { + int r; + + assert(j); + + if (!j->sent_dbus_new_signal) + bus_job_send_change_signal(j); + + /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */ + bus_unit_send_pending_change_signal(j->unit, true); + + r = bus_foreach_bus(j->manager, j->bus_track, send_removed_signal, j); + if (r < 0) + log_debug_errno(r, "Failed to send job remove signal for %u: %m", j->id); +} + +static int bus_job_track_handler(sd_bus_track *t, void *userdata) { + Job *j = ASSERT_PTR(userdata); + + assert(t); + + j->bus_track = sd_bus_track_unref(j->bus_track); /* make sure we aren't called again */ + + /* Last client dropped off the bus, maybe we should GC this now? */ + job_add_to_gc_queue(j); + return 0; +} + +static int bus_job_allocate_bus_track(Job *j) { + + assert(j); + + if (j->bus_track) + return 0; + + return sd_bus_track_new(j->unit->manager->api_bus, &j->bus_track, bus_job_track_handler, j); +} + +int bus_job_coldplug_bus_track(Job *j) { + int r; + _cleanup_strv_free_ char **deserialized_clients = NULL; + + assert(j); + + deserialized_clients = TAKE_PTR(j->deserialized_clients); + + if (strv_isempty(deserialized_clients)) + return 0; + + if (!j->manager->api_bus) + return 0; + + r = bus_job_allocate_bus_track(j); + if (r < 0) + return r; + + return bus_track_add_name_many(j->bus_track, deserialized_clients); +} + +int bus_job_track_sender(Job *j, sd_bus_message *m) { + int r; + + assert(j); + assert(m); + + if (sd_bus_message_get_bus(m) != j->unit->manager->api_bus) { + j->ref_by_private_bus = true; + return 0; + } + + r = bus_job_allocate_bus_track(j); + if (r < 0) + return r; + + return sd_bus_track_add_sender(j->bus_track, m); +} diff --git a/src/core/dbus-job.h b/src/core/dbus-job.h new file mode 100644 index 0000000..6f00581 --- /dev/null +++ b/src/core/dbus-job.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "unit.h" +#include "bus-object.h" + +extern const sd_bus_vtable bus_job_vtable[]; +extern const BusObjectImplementation job_object; + +int bus_job_method_cancel(sd_bus_message *message, void *job, sd_bus_error *error); +int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error); + +void bus_job_send_change_signal(Job *j); +void bus_job_send_pending_change_signal(Job *j, bool including_new); +void bus_job_send_removed_signal(Job *j); + +int bus_job_coldplug_bus_track(Job *j); +int bus_job_track_sender(Job *j, sd_bus_message *m); diff --git a/src/core/dbus-kill.c b/src/core/dbus-kill.c new file mode 100644 index 0000000..19e439f --- /dev/null +++ b/src/core/dbus-kill.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-get-properties.h" +#include "dbus-kill.h" +#include "dbus-util.h" +#include "kill.h" +#include "signal-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_kill_mode, kill_mode, KillMode); + +static int property_get_restart_kill_signal( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + KillContext *c = ASSERT_PTR(userdata); + int s; + + s = restart_kill_signal(c); + return sd_bus_message_append_basic(reply, 'i', &s); +} + +const sd_bus_vtable bus_kill_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("KillMode", "s", property_get_kill_mode, offsetof(KillContext, kill_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KillSignal", "i", bus_property_get_int, offsetof(KillContext, kill_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartKillSignal", "i", property_get_restart_kill_signal, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FinalKillSignal", "i", bus_property_get_int, offsetof(KillContext, final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SendSIGKILL", "b", bus_property_get_bool, offsetof(KillContext, send_sigkill), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SendSIGHUP", "b", bus_property_get_bool, offsetof(KillContext, send_sighup), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WatchdogSignal", "i", bus_property_get_int, offsetof(KillContext, watchdog_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static BUS_DEFINE_SET_TRANSIENT_PARSE(kill_mode, KillMode, kill_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(restart_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(final_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(watchdog_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); + +int bus_kill_context_set_transient_property( + Unit *u, + KillContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "KillMode")) + return bus_set_transient_kill_mode(u, name, &c->kill_mode, message, flags, error); + + if (streq(name, "SendSIGHUP")) + return bus_set_transient_bool(u, name, &c->send_sighup, message, flags, error); + + if (streq(name, "SendSIGKILL")) + return bus_set_transient_bool(u, name, &c->send_sigkill, message, flags, error); + + if (streq(name, "KillSignal")) + return bus_set_transient_kill_signal(u, name, &c->kill_signal, message, flags, error); + + if (streq(name, "RestartKillSignal")) + return bus_set_transient_restart_kill_signal(u, name, &c->restart_kill_signal, message, flags, error); + + if (streq(name, "FinalKillSignal")) + return bus_set_transient_final_kill_signal(u, name, &c->final_kill_signal, message, flags, error); + + if (streq(name, "WatchdogSignal")) + return bus_set_transient_watchdog_signal(u, name, &c->watchdog_signal, message, flags, error); + + return 0; +} diff --git a/src/core/dbus-kill.h b/src/core/dbus-kill.h new file mode 100644 index 0000000..5a90287 --- /dev/null +++ b/src/core/dbus-kill.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "kill.h" +#include "unit.h" + +extern const sd_bus_vtable bus_kill_vtable[]; + +int bus_kill_context_set_transient_property(Unit *u, KillContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c new file mode 100644 index 0000000..745f5cc --- /dev/null +++ b/src/core/dbus-manager.c @@ -0,0 +1,3628 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "architecture.h" +#include "build.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-log-control-api.h" +#include "chase.h" +#include "confidential-virt.h" +#include "data-fd-util.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-job.h" +#include "dbus-manager.h" +#include "dbus-scope.h" +#include "dbus-service.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "initrd-util.h" +#include "install.h" +#include "log.h" +#include "manager-dump.h" +#include "os-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "selinux-access.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "user-util.h" +#include "version.h" +#include "virt.h" +#include "watchdog.h" + +/* Require 16MiB free in /run/systemd for reloading/reexecing. After all we need to serialize our state + * there, and if we can't we'll fail badly. */ +#define RELOAD_DISK_SPACE_MIN (UINT64_C(16) * UINT64_C(1024) * UINT64_C(1024)) + +static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) { + return (runtime ? UNIT_FILE_RUNTIME : 0) | + (force ? UNIT_FILE_FORCE : 0); +} + +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_oom_policy, oom_policy, OOMPolicy); +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_emergency_action, emergency_action, EmergencyAction); + +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_version, "s", GIT_VERSION); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_features, "s", systemd_features); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_architecture, "s", architecture_to_string(uname_architecture())); +static BUS_DEFINE_PROPERTY_GET2(property_get_system_state, "s", Manager, manager_state, manager_state_to_string); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_timer_slack_nsec, "t", (uint64_t) prctl(PR_GET_TIMERSLACK)); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_hashmap_size, "u", Hashmap *, hashmap_size); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_set_size, "u", Set *, set_size); +static BUS_DEFINE_PROPERTY_GET(property_get_default_timeout_abort_usec, "t", Manager, manager_default_timeout_abort_usec); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_device, "s", watchdog_get_device()); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_last_ping_realtime, "t", watchdog_get_last_ping(CLOCK_REALTIME)); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_last_ping_monotonic, "t", watchdog_get_last_ping(CLOCK_MONOTONIC)); +static BUS_DEFINE_PROPERTY_GET(property_get_progress, "d", Manager, manager_get_progress); + +static int property_get_virtualization( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Virtualization v; + + assert(bus); + assert(reply); + + v = detect_virtualization(); + + /* Make sure to return the empty string when we detect no virtualization, as that is the API. + * + * https://github.com/systemd/systemd/issues/1423 + */ + + return sd_bus_message_append( + reply, "s", + v == VIRTUALIZATION_NONE ? NULL : virtualization_to_string(v)); +} + +static int property_get_confidential_virtualization( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ConfidentialVirtualization v; + + assert(bus); + assert(reply); + + v = detect_confidential_virtualization(); + + return sd_bus_message_append( + reply, "s", + v <= 0 ? NULL : confidential_virtualization_to_string(v)); +} + +static int property_get_tainted( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *s = NULL; + Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + s = manager_taint_string(m); + if (!s) + return log_oom(); + + return sd_bus_message_append(reply, "s", s); +} + +static int property_set_log_target( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + const char *t; + int r; + + assert(bus); + assert(value); + + r = sd_bus_message_read(value, "s", &t); + if (r < 0) + return r; + + if (isempty(t)) + manager_restore_original_log_target(m); + else { + LogTarget target; + + target = log_target_from_string(t); + if (target < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log target '%s'", t); + + manager_override_log_target(m, target); + } + + return 0; +} + +static int property_set_log_level( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + const char *t; + int r; + + assert(bus); + assert(value); + + r = sd_bus_message_read(value, "s", &t); + if (r < 0) + return r; + + if (isempty(t)) + manager_restore_original_log_level(m); + else { + int level; + + level = log_level_from_string(t); + if (level < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log level '%s'", t); + + manager_override_log_level(m, level); + } + + return 0; +} + +static int property_get_environment( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = manager_get_effective_environment(m, &l); + if (r < 0) + return r; + + return sd_bus_message_append_strv(reply, l); +} + +static int property_get_show_status( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "b", manager_get_show_status_on(m)); +} + +static int property_get_runtime_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_RUNTIME)); +} + +static int property_get_pretimeout_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_PRETIMEOUT)); +} + +static int property_get_pretimeout_watchdog_governor( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "s", m->watchdog_pretimeout_governor); +} + +static int property_get_reboot_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_REBOOT)); +} + +static int property_get_kexec_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_KEXEC)); +} + +static int property_set_watchdog(Manager *m, WatchdogType type, sd_bus_message *value) { + usec_t timeout; + int r; + + assert(m); + assert(value); + + assert_cc(sizeof(usec_t) == sizeof(uint64_t)); + + r = sd_bus_message_read(value, "t", &timeout); + if (r < 0) + return r; + + manager_override_watchdog(m, type, timeout); + return 0; +} + +static int property_set_runtime_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + return property_set_watchdog(userdata, WATCHDOG_RUNTIME, value); +} + +static int property_set_pretimeout_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + return property_set_watchdog(userdata, WATCHDOG_PRETIMEOUT, value); +} + +static int property_set_pretimeout_watchdog_governor( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + char *governor; + int r; + + r = sd_bus_message_read(value, "s", &governor); + if (r < 0) + return r; + if (!string_is_safe(governor)) + return -EINVAL; + + return manager_override_watchdog_pretimeout_governor(m, governor); +} + +static int property_set_reboot_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + return property_set_watchdog(userdata, WATCHDOG_REBOOT, value); +} + +static int property_set_kexec_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + _unused_ Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(value); + + return property_set_watchdog(userdata, WATCHDOG_KEXEC, value); +} + +static int property_get_oom_score_adjust( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + int r, n; + + assert(bus); + assert(reply); + + if (m->defaults.oom_score_adjust_set) + n = m->defaults.oom_score_adjust; + else { + n = 0; + r = get_oom_score_adjust(&n); + if (r < 0) + log_debug_errno(r, "Failed to read current OOM score adjustment value, ignoring: %m"); + } + + return sd_bus_message_append(reply, "i", n); +} + +static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) { + Unit *u; + int r; + + assert(m); + assert(message); + assert(ret_unit); + + /* More or less a wrapper around manager_get_unit() that generates nice errors and has one trick up + * its sleeve: if the name is specified empty we use the client's unit. */ + + if (isempty(name)) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_set(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit."); + } else { + u = manager_get_unit(m, name); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", name); + } + + *ret_unit = u; + return 0; +} + +static int bus_load_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) { + assert(m); + assert(message); + assert(ret_unit); + + /* Pretty much the same as bus_get_unit_by_name(), but we also load the unit if necessary. */ + + if (isempty(name)) + return bus_get_unit_by_name(m, message, name, ret_unit, error); + + return manager_load_unit(m, name, NULL, error, ret_unit); +} + +static int reply_unit_path(Unit *u, sd_bus_message *message, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + int r; + + assert(u); + assert(message); + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + path = unit_dbus_path(u); + if (!path) + return log_oom(); + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_get_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + const char *name; + Unit *u; + int r; + + assert(message); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + return reply_unit_path(u, message, error); +} + +static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + pid_t pid; + Unit *u; + int r; + + assert(message); + + assert_cc(sizeof(pid_t) == sizeof(uint32_t)); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "u", &pid); + if (r < 0) + return r; + if (pid < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pid); + + if (pid == 0) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pid); + + return reply_unit_path(u, message, error); +} + +static int method_get_unit_by_invocation_id(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = ASSERT_PTR(userdata); + sd_id128_t id; + const void *a; + Unit *u; + size_t sz; + int r; + + assert(message); + + /* Anyone can call this method */ + + r = sd_bus_message_read_array(message, 'y', &a, &sz); + if (r < 0) + return r; + if (sz == 0) + id = SD_ID128_NULL; + else if (sz == 16) + memcpy(&id, a, sz); + else + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid invocation ID"); + + if (sd_id128_is_null(id)) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, + "Client " PID_FMT " not member of any unit.", pid); + } else { + u = hashmap_get(m->units_by_invocation_id, &id); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", SD_ID128_FORMAT_VAL(id)); + } + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + /* So here's a special trick: the bus path we return actually references the unit by its invocation + * ID instead of the unit name. This means it stays valid only as long as the invocation ID stays the + * same. */ + path = unit_dbus_path_invocation_id(u); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_get_unit_by_control_group(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *cgroup; + Unit *u; + int r; + + r = sd_bus_message_read(message, "s", &cgroup); + if (r < 0) + return r; + + u = manager_get_unit_by_cgroup(m, cgroup); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, + "Control group '%s' is not valid or not managed by this instance", + cgroup); + + return reply_unit_path(u, message, error); +} + +static int method_get_unit_by_pidfd(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + Manager *m = ASSERT_PTR(userdata); + _cleanup_free_ char *path = NULL; + int r, pidfd; + Unit *u; + + assert(message); + + r = sd_bus_message_read(message, "h", &pidfd); + if (r < 0) + return r; + + r = pidref_set_pidfd(&pidref, pidfd); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to get PID from PIDFD: %m"); + + u = manager_get_unit_by_pidref(m, &pidref); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pidref.pid); + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + path = unit_dbus_path(u); + if (!path) + return log_oom(); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "os", path, u->id); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', u->invocation_id.bytes, sizeof(u->invocation_id.bytes)); + if (r < 0) + return r; + + /* Double-check that the process is still alive and that the PID did not change before returning the + * answer. */ + r = pidref_verify(&pidref); + if (r == -ESRCH) + return sd_bus_error_setf(error, + BUS_ERROR_NO_SUCH_PROCESS, + "The PIDFD's PID "PID_FMT" changed during the lookup operation.", + pidref.pid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to get PID from PIDFD: %m"); + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_load_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + const char *name; + Unit *u; + int r; + + assert(message); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_load_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + return reply_unit_path(u, message, error); +} + +static int method_start_unit_generic(sd_bus_message *message, Manager *m, JobType job_type, bool reload_if_possible, sd_bus_error *error) { + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_load_unit(m, name, NULL, error, &u); + if (r < 0) + return r; + + return bus_unit_method_start_generic(message, u, job_type, reload_if_possible, error); +} + +static int method_start_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_START, /* reload_if_possible = */ false, error); +} + +static int method_stop_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_STOP, /* reload_if_possible = */ false, error); +} + +static int method_reload_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_RELOAD, /* reload_if_possible = */ false, error); +} + +static int method_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_RESTART, /* reload_if_possible = */ false, error); +} + +static int method_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, /* reload_if_possible = */ false, error); +} + +static int method_reload_or_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_RESTART, /* reload_if_possible = */ true, error); +} + +static int method_reload_or_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, /* reload_if_possible = */ true, error); +} + +typedef enum GenericUnitOperationFlags { + GENERIC_UNIT_LOAD = 1 << 0, /* Load if the unit is not loaded yet */ + GENERIC_UNIT_VALIDATE_LOADED = 1 << 1, /* Verify unit is properly loaded before forwarding call */ +} GenericUnitOperationFlags; + +static int method_generic_unit_operation( + sd_bus_message *message, + Manager *m, + sd_bus_error *error, + sd_bus_message_handler_t handler, + GenericUnitOperationFlags flags) { + + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + /* Read the first argument from the command and pass the operation to the specified per-unit + * method. */ + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + if (!isempty(name) && FLAGS_SET(flags, GENERIC_UNIT_LOAD)) + r = manager_load_unit(m, name, NULL, error, &u); + else + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + if (FLAGS_SET(flags, GENERIC_UNIT_VALIDATE_LOADED)) { + r = bus_unit_validate_load_state(u, error); + if (r < 0) + return r; + } + + return handler(message, u, error); +} + +static int method_enqueue_unit_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* We don't bother with GENERIC_UNIT_VALIDATE_LOADED here, as the job logic validates that anyway */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_enqueue_job, GENERIC_UNIT_LOAD); +} + +static int method_start_unit_replace(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + const char *old_name; + Unit *u; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &old_name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, old_name, &u, error); + if (r < 0) + return r; + if (!u->job || u->job->type != JOB_START) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "No job queued for unit %s", old_name); + + return method_start_unit_generic(message, m, JOB_START, /* reload_if_possible = */ false, error); +} + +static int method_kill_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* We don't bother with GENERIC_UNIT_LOAD nor GENERIC_UNIT_VALIDATE_LOADED here, as it shouldn't + * matter whether a unit is loaded for killing any processes possibly in the unit's cgroup. */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_kill, 0); +} + +static int method_clean_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Load the unit if necessary, in order to load it, and insist on the unit being loaded to be + * cleaned */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_clean, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED); +} + +static int method_freeze_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_generic_unit_operation(message, userdata, error, bus_unit_method_freeze, 0); +} + +static int method_thaw_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_generic_unit_operation(message, userdata, error, bus_unit_method_thaw, 0); +} + +static int method_reset_failed_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Don't load the unit (because unloaded units can't be in failed state), and don't insist on the + * unit to be loaded properly (since a failed unit might have its unit file disappeared) */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_reset_failed, 0); +} + +static int method_set_unit_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Only change properties on fully loaded units, and load them in order to set properties */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_set_properties, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED); +} + +static int method_bind_mount_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Only add mounts on fully loaded units */ + return method_generic_unit_operation(message, userdata, error, bus_service_method_bind_mount, GENERIC_UNIT_VALIDATE_LOADED); +} + +static int method_mount_image_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Only add mounts on fully loaded units */ + return method_generic_unit_operation(message, userdata, error, bus_service_method_mount_image, GENERIC_UNIT_VALIDATE_LOADED); +} + +static int method_ref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Only allow reffing of fully loaded units, and make sure reffing a unit loads it. */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_ref, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED); +} + +static int method_unref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Dropping a ref OTOH should not require the unit to still be loaded. And since a reffed unit is a + * loaded unit there's no need to load the unit for unreffing it. */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_unref, 0); +} + +static int reply_unit_info(sd_bus_message *reply, Unit *u) { + _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + Unit *following; + + following = unit_following(u); + + unit_path = unit_dbus_path(u); + if (!unit_path) + return -ENOMEM; + + if (u->job) { + job_path = job_dbus_path(u->job); + if (!job_path) + return -ENOMEM; + } + + return sd_bus_message_append( + reply, "(ssssssouso)", + u->id, + unit_description(u), + unit_load_state_to_string(u->load_state), + unit_active_state_to_string(unit_active_state(u)), + unit_sub_state_to_string(u), + following ? following->id : "", + unit_path, + u->job ? u->job->id : 0, + u->job ? job_type_to_string(u->job->type) : "", + empty_to_root(job_path)); +} + +static int method_list_units_by_names(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + int r; + _cleanup_strv_free_ char **units = NULL; + + assert(message); + + r = sd_bus_message_read_strv(message, &units); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)"); + if (r < 0) + return r; + + STRV_FOREACH(unit, units) { + Unit *u; + + if (!unit_name_is_valid(*unit, UNIT_NAME_ANY)) + continue; + + r = bus_load_unit_by_name(m, message, *unit, &u, error); + if (r < 0) + return r; + + r = reply_unit_info(reply, u); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Don't load a unit (since it won't have any processes if it's not loaded), but don't insist on the + * unit being loaded (because even improperly loaded units might still have processes around */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_get_processes, 0); +} + +static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Don't allow attaching new processes to units that aren't loaded. Don't bother with loading a unit + * for this purpose though, as an unloaded unit is a stopped unit, and we don't allow attaching + * processes to stopped units anyway. */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_attach_processes, GENERIC_UNIT_VALIDATE_LOADED); +} + +static int transient_unit_from_message( + Manager *m, + sd_bus_message *message, + const char *name, + Unit **unit, + sd_bus_error *error) { + + UnitType t; + Unit *u; + int r; + + assert(m); + assert(message); + assert(name); + + t = unit_name_to_type(name); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid unit name or type."); + + if (!unit_vtable[t]->can_transient) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Unit type %s does not support transient units.", + unit_type_to_string(t)); + + r = manager_load_unit(m, name, NULL, error, &u); + if (r < 0) + return r; + + if (!unit_is_pristine(u)) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, + "Unit %s was already loaded or has a fragment file.", name); + + /* OK, the unit failed to load and is unreferenced, now let's + * fill in the transient data instead */ + r = unit_make_transient(u); + if (r < 0) + return r; + + /* Set our properties */ + r = bus_unit_set_properties(u, message, UNIT_RUNTIME, false, error); + if (r < 0) + return r; + + /* If the client asked for it, automatically add a reference to this unit. */ + if (u->bus_track_add) { + r = bus_unit_track_add_sender(u, message); + if (r < 0) + return log_error_errno(r, "Failed to watch sender: %m"); + } + + /* Now load the missing bits of the unit we just created */ + unit_add_to_load_queue(u); + manager_dispatch_load_queue(m); + + *unit = u; + + return 0; +} + +static int transient_aux_units_from_message( + Manager *m, + sd_bus_message *message, + sd_bus_error *error) { + + int r; + + assert(m); + assert(message); + + r = sd_bus_message_enter_container(message, 'a', "(sa(sv))"); + if (r < 0) + return r; + + while ((r = sd_bus_message_enter_container(message, 'r', "sa(sv)")) > 0) { + const char *name = NULL; + Unit *u; + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = transient_unit_from_message(m, message, name, &u, error); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return 0; +} + +static int method_start_transient_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *name, *smode; + Manager *m = ASSERT_PTR(userdata); + JobMode mode; + Unit *u; + int r; + + assert(message); + + r = mac_selinux_access_check(message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ss", &name, &smode); + if (r < 0) + return r; + + mode = job_mode_from_string(smode); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s is invalid.", smode); + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = transient_unit_from_message(m, message, name, &u, error); + if (r < 0) + return r; + + r = transient_aux_units_from_message(m, message, error); + if (r < 0) + return r; + + /* Finally, start it */ + return bus_unit_queue_job(message, u, JOB_START, mode, 0, error); +} + +static int method_get_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = ASSERT_PTR(userdata); + uint32_t id; + Job *j; + int r; + + assert(message); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "u", &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id); + + r = mac_selinux_unit_access_check(j->unit, message, "status", error); + if (r < 0) + return r; + + path = job_dbus_path(j); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_cancel_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + uint32_t id; + Job *j; + int r; + + assert(message); + + r = sd_bus_message_read(message, "u", &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id); + + return bus_job_method_cancel(message, j, error); +} + +static int method_clear_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + manager_clear_jobs(m); + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + manager_reset_failed(m); + + return sd_bus_reply_method_return(message, NULL); +} + +static int list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *k; + Unit *u; + int r; + + assert(message); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)"); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(u, k, m->units) { + if (k != u->id) + continue; + + if (!strv_isempty(states) && + !strv_contains(states, unit_load_state_to_string(u->load_state)) && + !strv_contains(states, unit_active_state_to_string(unit_active_state(u))) && + !strv_contains(states, unit_sub_state_to_string(u))) + continue; + + if (!strv_isempty(patterns) && + !strv_fnmatch_or_empty(patterns, u->id, FNM_NOESCAPE)) + continue; + + r = reply_unit_info(reply, u); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_list_units(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return list_units_filtered(message, userdata, error, NULL, NULL); +} + +static int method_list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **states = NULL; + int r; + + r = sd_bus_message_read_strv(message, &states); + if (r < 0) + return r; + + return list_units_filtered(message, userdata, error, states, NULL); +} + +static int method_list_units_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **states = NULL; + _cleanup_strv_free_ char **patterns = NULL; + int r; + + r = sd_bus_message_read_strv(message, &states); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &patterns); + if (r < 0) + return r; + + return list_units_filtered(message, userdata, error, states, patterns); +} + +static int method_list_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + Job *j; + int r; + + assert(message); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(usssoo)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(j, m->jobs) { + _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + + job_path = job_dbus_path(j); + if (!job_path) + return -ENOMEM; + + unit_path = unit_dbus_path(j->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append( + reply, "(usssoo)", + j->id, + j->unit->id, + job_type_to_string(j->type), + job_state_to_string(j->state), + job_path, + unit_path); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_subscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + if (sd_bus_message_get_bus(message) == m->api_bus) { + + /* Note that direct bus connection subscribe by + * default, we only track peers on the API bus here */ + + if (!m->subscribed) { + r = sd_bus_track_new(sd_bus_message_get_bus(message), &m->subscribed, NULL, NULL); + if (r < 0) + return r; + } + + r = sd_bus_track_add_sender(m->subscribed, message); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_set(error, BUS_ERROR_ALREADY_SUBSCRIBED, "Client is already subscribed."); + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_unsubscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + if (sd_bus_message_get_bus(message) == m->api_bus) { + r = sd_bus_track_remove_sender(m->subscribed, message); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_set(error, BUS_ERROR_NOT_SUBSCRIBED, "Client is not subscribed."); + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int dump_impl( + sd_bus_message *message, + void *userdata, + sd_bus_error *error, + char **patterns, + int (*reply)(sd_bus_message *, char *)) { + + _cleanup_free_ char *dump = NULL; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + /* 'status' access is the bare minimum always needed for this, as the policy might straight out + * forbid a client from querying any information from systemd, regardless of any rate limiting. */ + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + /* Rate limit reached? Check if the caller is privileged/allowed by policy to bypass this. We + * check the rate limit first to avoid the expensive roundtrip to polkit when not needed. */ + if (!ratelimit_below(&m->dump_ratelimit)) { + /* We need a way for SELinux to constrain the operation when the rate limit is active, even + * if polkit would allow it, but we cannot easily add new named permissions, so we need to + * use an existing one. Reload/reexec are also slow but non-destructive/modifying + * operations, and can cause PID1 to stall. So it seems similar enough in terms of security + * considerations and impact, and thus use the same access check for dumps which, given the + * large amount of data to fetch, can stall PID1 for quite some time. */ + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + goto ratelimited; + + r = bus_verify_bypass_dump_ratelimit_async(m, message, error); + if (r < 0) + goto ratelimited; + if (r == 0) + /* No authorization for now, but the async polkit stuff will call us again when it + * has it */ + return 1; + } + + r = manager_get_dump_string(m, patterns, &dump); + if (r < 0) + return r; + + return reply(message, dump); + +ratelimited: + log_warning("Dump request rejected due to rate limit on unprivileged callers, blocked for %s.", + FORMAT_TIMESPAN(ratelimit_left(&m->dump_ratelimit), USEC_PER_SEC)); + return sd_bus_error_setf(error, + SD_BUS_ERROR_LIMITS_EXCEEDED, + "Dump request rejected due to rate limit on unprivileged callers, blocked for %s.", + FORMAT_TIMESPAN(ratelimit_left(&m->dump_ratelimit), USEC_PER_SEC)); +} + +static int reply_dump(sd_bus_message *message, char *dump) { + return sd_bus_reply_method_return(message, "s", dump); +} + +static int method_dump(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return dump_impl(message, userdata, error, NULL, reply_dump); +} + +static int reply_dump_by_fd(sd_bus_message *message, char *dump) { + _cleanup_close_ int fd = -EBADF; + + fd = acquire_data_fd(dump, strlen(dump), 0); + if (fd < 0) + return fd; + + return sd_bus_reply_method_return(message, "h", fd); +} + +static int method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return dump_impl(message, userdata, error, NULL, reply_dump_by_fd); +} + +static int dump_units_matching_patterns( + sd_bus_message *message, + void *userdata, + sd_bus_error *error, + int (*reply)(sd_bus_message *, char *)) { + _cleanup_strv_free_ char **patterns = NULL; + int r; + + r = sd_bus_message_read_strv(message, &patterns); + if (r < 0) + return r; + + return dump_impl(message, userdata, error, patterns, reply); +} + +static int method_dump_units_matching_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return dump_units_matching_patterns(message, userdata, error, reply_dump); +} + +static int method_dump_units_matching_patterns_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return dump_units_matching_patterns(message, userdata, error, reply_dump_by_fd); +} + +static int method_refuse_snapshot(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Support for snapshots has been removed."); +} + +static int get_run_space(uint64_t *ret, sd_bus_error *error) { + struct statvfs svfs; + + assert(ret); + + if (statvfs("/run/systemd", &svfs) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m"); + + *ret = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize; + return 0; +} + +static int verify_run_space(const char *message, sd_bus_error *error) { + uint64_t available = 0; /* unnecessary, but used to trick out gcc's incorrect maybe-uninitialized warning */ + int r; + + assert(message); + + r = get_run_space(&available, error); + if (r < 0) + return r; + + if (available < RELOAD_DISK_SPACE_MIN) + return sd_bus_error_setf(error, + BUS_ERROR_DISK_FULL, + "%s, not enough space available on /run/systemd/. " + "Currently, %s are free, but a safety buffer of %s is enforced.", + message, + FORMAT_BYTES(available), + FORMAT_BYTES(RELOAD_DISK_SPACE_MIN)); + + return 0; +} + +int verify_run_space_and_log(const char *message) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(message); + + r = verify_run_space(message, &error); + if (r < 0) + return log_error_errno(r, "%s", bus_error_message(&error, r)); + + return 0; +} + +static int verify_run_space_permissive(const char *message, sd_bus_error *error) { + uint64_t available = 0; /* unnecessary, but used to trick out gcc's incorrect maybe-uninitialized warning */ + int r; + + assert(message); + + r = get_run_space(&available, error); + if (r < 0) + return r; + + if (available < RELOAD_DISK_SPACE_MIN) + log_warning("Dangerously low amount of free space on /run/systemd/, %s.\n" + "Currently, %s are free, but %s are suggested. Proceeding anyway.", + message, + FORMAT_BYTES(available), + FORMAT_BYTES(RELOAD_DISK_SPACE_MIN)); + + return 0; +} + +static void log_caller(sd_bus_message *message, Manager *manager, const char *method) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + const char *comm = NULL; + Unit *caller; + pid_t pid; + + assert(message); + assert(manager); + assert(method); + + if (sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID|SD_BUS_CREDS_AUGMENT|SD_BUS_CREDS_COMM, &creds) < 0) + return; + + /* We need at least the PID, otherwise there's nothing to log, the rest is optional */ + if (sd_bus_creds_get_pid(creds, &pid) < 0) + return; + + (void) sd_bus_creds_get_comm(creds, &comm); + caller = manager_get_unit_by_pid(manager, pid); + + log_info("%s requested from client PID " PID_FMT "%s%s%s%s%s%s...", + method, pid, + comm ? " ('" : "", strempty(comm), comm ? "')" : "", + caller ? " (unit " : "", caller ? caller->id : "", caller ? ")" : ""); +} + +static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = verify_run_space("Refusing to reload", error); + if (r < 0) + return r; + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_reload_daemon_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + /* Write a log message noting the unit or process who requested the Reload() */ + log_caller(message, m, "Reloading"); + + /* Check the rate limit after the authorization succeeds, to avoid denial-of-service issues. */ + if (!ratelimit_below(&m->reload_ratelimit)) { + log_warning("Reloading request rejected due to rate limit."); + return sd_bus_error_setf(error, + SD_BUS_ERROR_LIMITS_EXCEEDED, + "Reload() request rejected due to rate limit."); + } + + /* Instead of sending the reply back right away, we just + * remember that we need to and then send it after the reload + * is finished. That way the caller knows when the reload + * finished. */ + + assert(!m->pending_reload_message); + r = sd_bus_message_new_method_return(message, &m->pending_reload_message); + if (r < 0) + return r; + + m->objective = MANAGER_RELOAD; + + return 1; +} + +static int method_reexecute(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = verify_run_space("Refusing to reexecute", error); + if (r < 0) + return r; + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_reload_daemon_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + /* Write a log message noting the unit or process who requested the Reexecute() */ + log_caller(message, m, "Reexecuting"); + + /* We don't send a reply back here, the client should + * just wait for us disconnecting. */ + + m->objective = MANAGER_REEXECUTE; + return 1; +} + +static int method_exit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "halt", error); + if (r < 0) + return r; + + /* Exit() (in contrast to SetExitCode()) is actually allowed even if + * we are running on the host. It will fall back on reboot() in + * systemd-shutdown if it cannot do the exit() because it isn't a + * container. */ + + m->objective = MANAGER_EXIT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, + "Reboot is only supported for system managers."); + + m->objective = MANAGER_REBOOT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_soft_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *rt = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *root; + int r; + + assert(message); + + r = verify_run_space_permissive("soft reboot may fail", error); + if (r < 0) + return r; + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &root); + if (r < 0) + return r; + + if (!isempty(root)) { + if (!path_is_valid(root)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "New root directory '%s' must be a valid path.", root); + if (!path_is_absolute(root)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "New root directory path '%s' is not absolute.", root); + + rt = strdup(root); + if (!rt) + return -ENOMEM; + } + + free_and_replace(m->switch_root, rt); + m->objective = MANAGER_SOFT_REBOOT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_poweroff(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "halt", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, + "Powering off is only supported for system managers."); + + m->objective = MANAGER_POWEROFF; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_halt(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "halt", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, + "Halt is only supported for system managers."); + + m->objective = MANAGER_HALT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_kexec(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, + "KExec is only supported for system managers."); + + m->objective = MANAGER_KEXEC; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_switch_root(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *ri = NULL, *rt = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *root, *init; + int r; + + assert(message); + + r = verify_run_space_permissive("root switching may fail", error); + if (r < 0) + return r; + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, + "Root switching is only supported by system manager."); + + r = sd_bus_message_read(message, "ss", &root, &init); + if (r < 0) + return r; + + if (isempty(root)) + /* If path is not specified, default to "/sysroot" which is what we generally expect initrds + * to use */ + root = "/sysroot"; + else { + if (!path_is_valid(root)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "New root directory must be a valid path."); + + if (!path_is_absolute(root)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "New root path '%s' is not absolute.", root); + + r = path_is_root(root); + if (r < 0) + return sd_bus_error_set_errnof(error, r, + "Failed to check if new root directory '%s' is the same as old root: %m", + root); + if (r > 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "New root directory cannot be the old root directory."); + } + + /* Safety check */ + if (!in_initrd()) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Not in initrd, refusing switch-root operation."); + + r = path_is_os_tree(root); + if (r < 0) + return sd_bus_error_set_errnof(error, r, + "Failed to determine whether root path '%s' contains an OS tree: %m", + root); + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Specified switch root path '%s' does not seem to be an OS tree. os-release file is missing.", + root); + + if (!isempty(init)) { + if (!path_is_valid(init)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Path to init binary '%s' is not a valid path.", init); + + if (!path_is_absolute(init)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Path to init binary '%s' not absolute.", init); + + r = chase_and_access(init, root, CHASE_PREFIX_ROOT, X_OK, NULL); + if (r == -EACCES) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Init binary %s is not executable.", init); + if (r < 0) + return sd_bus_error_set_errnof(error, r, + "Could not resolve init executable %s: %m", init); + } + + rt = strdup(root); + if (!rt) + return -ENOMEM; + + if (!isempty(init)) { + ri = strdup(init); + if (!ri) + return -ENOMEM; + } + + free_and_replace(m->switch_root, rt); + free_and_replace(m->switch_root_init, ri); + + m->objective = MANAGER_SWITCH_ROOT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **plus = NULL; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &plus); + if (r < 0) + return r; + if (!strv_env_is_valid(plus)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment assignments"); + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = manager_client_environment_modify(m, NULL, plus); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_unset_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **minus = NULL; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &minus); + if (r < 0) + return r; + + if (!strv_env_name_or_assignment_is_valid(minus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid environment variable names or assignments"); + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = manager_client_environment_modify(m, minus, NULL); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_unset_and_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **minus = NULL, **plus = NULL; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &minus); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &plus); + if (r < 0) + return r; + + if (!strv_env_name_or_assignment_is_valid(minus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid environment variable names or assignments"); + if (!strv_env_is_valid(plus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid environment assignments"); + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = manager_client_environment_modify(m, minus, plus); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_exit_code(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + uint8_t code; + int r; + + assert(message); + + r = mac_selinux_access_check(message, "exit", error); + if (r < 0) + return r; + + r = sd_bus_message_read_basic(message, 'y', &code); + if (r < 0) + return r; + + m->return_value = code; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_lookup_dynamic_user_by_name(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + const char *name; + uid_t uid; + int r; + + assert(message); + + r = sd_bus_message_read_basic(message, 's', &name); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, + "Dynamic users are only supported in the system instance."); + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "User name invalid: %s", name); + + r = dynamic_user_lookup_name(m, name, &uid); + if (r == -ESRCH) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER, + "Dynamic user %s does not exist.", name); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "u", (uint32_t) uid); +} + +static int method_lookup_dynamic_user_by_uid(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *name = NULL; + Manager *m = ASSERT_PTR(userdata); + uid_t uid; + int r; + + assert(message); + + assert_cc(sizeof(uid_t) == sizeof(uint32_t)); + r = sd_bus_message_read_basic(message, 'u', &uid); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, + "Dynamic users are only supported in the system instance."); + if (!uid_is_valid(uid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "User ID invalid: " UID_FMT, uid); + + r = dynamic_user_lookup_uid(m, uid, &name); + if (r == -ESRCH) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER, + "Dynamic user ID " UID_FMT " does not exist.", uid); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", name); +} + +static int method_get_dynamic_users(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + DynamicUser *d; + int r; + + assert(message); + + assert_cc(sizeof(uid_t) == sizeof(uint32_t)); + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, + "Dynamic users are only supported in the system instance."); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(us)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(d, m->dynamic_users) { + uid_t uid; + + r = dynamic_user_current(d, &uid); + if (r == -EAGAIN) /* not realized yet? */ + continue; + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, + "Failed to look up a dynamic user."); + + r = sd_bus_message_append(reply, "(us)", uid, d->name); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_enqueue_marked_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_access_check(message, "start", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + log_info("Queuing reload/restart jobs for marked units%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "o"); + if (r < 0) + return r; + + Unit *u; + char *k; + int ret = 0; + HASHMAP_FOREACH_KEY(u, k, m->units) { + /* ignore aliases */ + if (u->id != k) + continue; + + BusUnitQueueFlags flags; + if (FLAGS_SET(u->markers, 1u << UNIT_MARKER_NEEDS_RESTART)) + flags = 0; + else if (FLAGS_SET(u->markers, 1u << UNIT_MARKER_NEEDS_RELOAD)) + flags = BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE; + else + continue; + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r >= 0) + r = bus_unit_queue_job_one(message, u, + JOB_TRY_RESTART, JOB_FAIL, flags, + reply, error); + if (ERRNO_IS_NEG_RESOURCE(r)) + return r; + if (r < 0) { + if (ret >= 0) + ret = r; + sd_bus_error_free(error); + } + } + + if (ret < 0) + return sd_bus_error_set_errnof(error, ret, + "Failed to enqueue some jobs, see logs for details: %m"); + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + UnitFileList *item; + _cleanup_hashmap_free_ Hashmap *h = NULL; + int r; + + assert(message); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + h = hashmap_new(&unit_file_list_hash_ops_free); + if (!h) + return -ENOMEM; + + r = unit_file_get_list(m->runtime_scope, NULL, h, states, patterns); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(item, h) { + + r = sd_bus_message_append(reply, "(ss)", item->path, unit_file_state_to_string(item->state)); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_list_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return list_unit_files_by_patterns(message, userdata, error, NULL, NULL); +} + +static int method_list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **states = NULL; + _cleanup_strv_free_ char **patterns = NULL; + int r; + + r = sd_bus_message_read_strv(message, &states); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &patterns); + if (r < 0) + return r; + + return list_unit_files_by_patterns(message, userdata, error, states, patterns); +} + +static int method_get_unit_file_state(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + const char *name; + UnitFileState state; + int r; + + assert(message); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = unit_file_get_state(m->runtime_scope, NULL, name, &state); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", unit_file_state_to_string(state)); +} + +static int method_get_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *default_target = NULL; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = unit_file_get_default(m->runtime_scope, NULL, &default_target); + if (r == -ERFKILL) + sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit file is masked."); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", default_target); +} + +static int send_unit_files_changed(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL; + int r; + + assert(bus); + + r = sd_bus_message_new_signal(bus, &message, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "UnitFilesChanged"); + if (r < 0) + return r; + + return sd_bus_send(bus, message, NULL); +} + +/* Create an error reply, using the error information from changes[] + * if possible, and fall back to generating an error from error code c. + * The error message only describes the first error. + */ +static int install_error( + sd_bus_error *error, + int c, + InstallChange *changes, + size_t n_changes) { + + CLEANUP_ARRAY(changes, n_changes, install_changes_free); + + for (size_t i = 0; i < n_changes; i++) + + /* When making changes here, make sure to also change install_changes_dump() in install.c. */ + + switch (changes[i].type) { + case 0 ... _INSTALL_CHANGE_TYPE_MAX: /* not errors */ + break; + + case -EEXIST: + if (changes[i].source) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, + "File %s already exists and is a symlink to %s.", + changes[i].path, changes[i].source); + return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, + "File %s already exists.", + changes[i].path); + + case -ERFKILL: + return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, + "Unit file %s is masked.", changes[i].path); + + case -EADDRNOTAVAIL: + return sd_bus_error_setf(error, BUS_ERROR_UNIT_GENERATED, + "Unit %s is transient or generated.", changes[i].path); + + case -ETXTBSY: + return sd_bus_error_setf(error, BUS_ERROR_UNIT_BAD_PATH, + "File %s is under the systemd unit hierarchy already.", changes[i].path); + + case -EBADSLT: + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, + "Invalid specifier in %s.", changes[i].path); + + case -EIDRM: + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, + "Destination unit %s is a non-template unit.", changes[i].path); + + case -EUCLEAN: + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, + "\"%s\" is not a valid unit name.", + changes[i].path); + + case -ELOOP: + return sd_bus_error_setf(error, BUS_ERROR_UNIT_LINKED, + "Refusing to operate on alias name or linked unit file: %s", + changes[i].path); + + case -EXDEV: + if (changes[i].source) + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, + "Cannot alias %s as %s.", + changes[i].source, changes[i].path); + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, + "Invalid unit reference %s.", changes[i].path); + + case -ENOENT: + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, + "Unit file %s does not exist.", changes[i].path); + + case -EUNATCH: + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, + "Cannot resolve specifiers in %s.", changes[i].path); + + default: + assert(changes[i].type < 0); /* other errors */ + return sd_bus_error_set_errnof(error, changes[i].type, "File %s: %m", changes[i].path); + } + + return c < 0 ? c : -EINVAL; +} + +static int reply_install_changes_and_free( + Manager *m, + sd_bus_message *message, + int carries_install_info, + InstallChange *changes, + size_t n_changes, + sd_bus_error *error) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + bool bad = false, good = false; + int r; + + CLEANUP_ARRAY(changes, n_changes, install_changes_free); + + if (install_changes_have_modification(changes, n_changes)) { + r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL); + if (r < 0) + log_debug_errno(r, "Failed to send UnitFilesChanged signal: %m"); + } + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + if (carries_install_info >= 0) { + r = sd_bus_message_append(reply, "b", carries_install_info); + if (r < 0) + return r; + } + + r = sd_bus_message_open_container(reply, 'a', "(sss)"); + if (r < 0) + return r; + + for (size_t i = 0; i < n_changes; i++) { + + if (changes[i].type < 0) { + bad = true; + continue; + } + + r = sd_bus_message_append( + reply, "(sss)", + install_change_type_to_string(changes[i].type), + changes[i].path, + changes[i].source); + if (r < 0) + return r; + + good = true; + } + + /* If there was a failed change, and no successful change, then return the first failure as proper + * method call error. */ + if (bad && !good) + return install_error(error, 0, TAKE_PTR(changes), n_changes); + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_enable_unit_files_generic( + sd_bus_message *message, + Manager *m, + int (*call)(RuntimeScope scope, UnitFileFlags flags, const char *root_dir, char *files[], InstallChange **changes, size_t *n_changes), + bool carries_install_info, + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + InstallChange *changes = NULL; + size_t n_changes = 0; + UnitFileFlags flags; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (sd_bus_message_is_method_call(message, NULL, "EnableUnitFilesWithFlags")) { + uint64_t raw_flags; + + r = sd_bus_message_read(message, "t", &raw_flags); + if (r < 0) + return r; + if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0) + return -EINVAL; + flags = raw_flags; + } else { + int runtime, force; + + r = sd_bus_message_read(message, "bb", &runtime, &force); + if (r < 0) + return r; + flags = unit_file_bools_to_flags(runtime, force); + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = call(m->runtime_scope, flags, NULL, l, &changes, &n_changes); + m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_install_changes_and_free(m, message, carries_install_info ? r : -1, changes, n_changes, error); +} + +static int method_enable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_enable, /* carries_install_info = */ true, error); +} + +static int method_enable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_enable, /* carries_install_info = */ true, error); +} + +static int method_reenable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_reenable, /* carries_install_info = */ true, error); +} + +static int method_link_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_link, /* carries_install_info = */ false, error); +} + +static int unit_file_preset_without_mode(RuntimeScope scope, UnitFileFlags flags, const char *root_dir, char **files, InstallChange **changes, size_t *n_changes) { + return unit_file_preset(scope, flags, root_dir, files, UNIT_FILE_PRESET_FULL, changes, n_changes); +} + +static int method_preset_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_preset_without_mode, /* carries_install_info = */ true, error); +} + +static int method_mask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_mask, /* carries_install_info = */ false, error); +} + +static int method_preset_unit_files_with_mode(sd_bus_message *message, void *userdata, sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + InstallChange *changes = NULL; + size_t n_changes = 0; + Manager *m = ASSERT_PTR(userdata); + UnitFilePresetMode preset_mode; + int runtime, force, r; + UnitFileFlags flags; + const char *mode; + + assert(message); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + if (isempty(mode)) + preset_mode = UNIT_FILE_PRESET_FULL; + else { + preset_mode = unit_file_preset_mode_from_string(mode); + if (preset_mode < 0) + return -EINVAL; + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_preset(m->runtime_scope, flags, NULL, l, preset_mode, &changes, &n_changes); + m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_install_changes_and_free(m, message, r, changes, n_changes, error); +} + +static int method_disable_unit_files_generic( + sd_bus_message *message, + Manager *m, + int (*call)(RuntimeScope scope, UnitFileFlags flags, const char *root_dir, char *files[], InstallChange **changes, size_t *n_changes), + bool carries_install_info, + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + InstallChange *changes = NULL; + UnitFileFlags flags; + size_t n_changes = 0; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (sd_bus_message_is_method_call(message, NULL, "DisableUnitFilesWithFlags") || + sd_bus_message_is_method_call(message, NULL, "DisableUnitFilesWithFlagsAndInstallInfo")) { + uint64_t raw_flags; + + r = sd_bus_message_read(message, "t", &raw_flags); + if (r < 0) + return r; + if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0 || + FLAGS_SET(raw_flags, UNIT_FILE_FORCE)) + return -EINVAL; + flags = raw_flags; + } else { + int runtime; + + r = sd_bus_message_read(message, "b", &runtime); + if (r < 0) + return r; + flags = unit_file_bools_to_flags(runtime, false); + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = call(m->runtime_scope, flags, NULL, l, &changes, &n_changes); + m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_install_changes_and_free(m, message, carries_install_info ? r : -1, changes, n_changes, error); +} + +static int method_disable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_disable_unit_files_generic(message, userdata, unit_file_disable, /* carries_install_info = */ false, error); +} + +static int method_disable_unit_files_with_flags_and_install_info(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_disable_unit_files_generic(message, userdata, unit_file_disable, /* carries_install_info = */ true, error); +} + +static int method_disable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_disable_unit_files_generic(message, userdata, unit_file_disable, /* carries_install_info = */ false, error); +} + +static int method_unmask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_disable_unit_files_generic(message, userdata, unit_file_unmask, /* carries_install_info = */ false, error); +} + +static int method_revert_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + InstallChange *changes = NULL; + size_t n_changes = 0; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_revert(m->runtime_scope, NULL, l, &changes, &n_changes); + m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_install_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_set_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) { + InstallChange *changes = NULL; + size_t n_changes = 0; + Manager *m = ASSERT_PTR(userdata); + const char *name; + int force, r; + + assert(message); + + r = mac_selinux_access_check(message, "enable", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "sb", &name, &force); + if (r < 0) + return r; + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_set_default(m->runtime_scope, force ? UNIT_FILE_FORCE : 0, NULL, name, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_install_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_preset_all_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + InstallChange *changes = NULL; + size_t n_changes = 0; + Manager *m = ASSERT_PTR(userdata); + UnitFilePresetMode preset_mode; + const char *mode; + UnitFileFlags flags; + int force, runtime, r; + + assert(message); + + r = mac_selinux_access_check(message, "enable", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + if (isempty(mode)) + preset_mode = UNIT_FILE_PRESET_FULL; + else { + preset_mode = unit_file_preset_mode_from_string(mode); + if (preset_mode < 0) + return -EINVAL; + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_preset_all(m->runtime_scope, flags, NULL, preset_mode, &changes, &n_changes); + m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_install_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_add_dependency_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = ASSERT_PTR(userdata); + InstallChange *changes = NULL; + size_t n_changes = 0; + int runtime, force, r; + char *target, *type; + UnitDependency dep; + UnitFileFlags flags; + + assert(message); + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ssbb", &target, &type, &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + dep = unit_dependency_from_string(type); + if (dep < 0) + return -EINVAL; + + r = unit_file_add_dependency(m->runtime_scope, flags, NULL, l, target, dep, &changes, &n_changes); + m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_install_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_get_unit_file_links(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + InstallChange *changes = NULL; + size_t n_changes = 0, i; + const char *name; + int runtime, r; + + CLEANUP_ARRAY(changes, n_changes, install_changes_free); + + r = sd_bus_message_read(message, "sb", &name, &runtime); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, SD_BUS_TYPE_ARRAY, "s"); + if (r < 0) + return r; + + r = unit_file_disable(m->runtime_scope, + UNIT_FILE_DRY_RUN | (runtime ? UNIT_FILE_RUNTIME : 0), + NULL, STRV_MAKE(name), &changes, &n_changes); + if (r < 0) + return log_error_errno(r, "Failed to get file links for %s: %m", name); + + for (i = 0; i < n_changes; i++) + if (changes[i].type == INSTALL_CHANGE_UNLINK) { + r = sd_bus_message_append(reply, "s", changes[i].path); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_get_job_waiting(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + uint32_t id; + Job *j; + int r; + + assert(message); + + r = sd_bus_message_read(message, "u", &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id); + + return bus_job_method_get_waiting_jobs(message, j, error); +} + +static int method_abandon_scope(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + const char *name; + Unit *u; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + if (u->type != UNIT_SCOPE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Unit '%s' is not a scope unit, refusing.", name); + + return bus_scope_method_abandon(message, u, error); +} + +static int method_set_show_status(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + ShowStatus mode = _SHOW_STATUS_INVALID; + const char *t; + int r; + + assert(message); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = sd_bus_message_read(message, "s", &t); + if (r < 0) + return r; + + if (!isempty(t)) { + mode = show_status_from_string(t); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid show status '%s'", t); + } + + manager_override_show_status(m, mode, "bus"); + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_dump_unit_descriptor_store(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_generic_unit_operation(message, userdata, error, bus_service_method_dump_file_descriptor_store, 0); +} + +const sd_bus_vtable bus_manager_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Version", "s", property_get_version, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Features", "s", property_get_features, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Virtualization", "s", property_get_virtualization, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConfidentialVirtualization", "s", property_get_confidential_virtualization, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Architecture", "s", property_get_architecture, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Tainted", "s", property_get_tainted, 0, SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("FirmwareTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FIRMWARE]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("LoaderTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_LOADER]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("KernelTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_KERNEL]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UserspaceTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_USERSPACE]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("FinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("SecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("SecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0), + SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0), + SD_BUS_PROPERTY("NNames", "u", property_get_hashmap_size, offsetof(Manager, units), 0), + SD_BUS_PROPERTY("NFailedUnits", "u", property_get_set_size, offsetof(Manager, failed_units), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NJobs", "u", property_get_hashmap_size, offsetof(Manager, jobs), 0), + SD_BUS_PROPERTY("NInstalledJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_installed_jobs), 0), + SD_BUS_PROPERTY("NFailedJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_failed_jobs), 0), + SD_BUS_PROPERTY("Progress", "d", property_get_progress, 0, 0), + SD_BUS_PROPERTY("Environment", "as", property_get_environment, 0, 0), + SD_BUS_PROPERTY("ConfirmSpawn", "b", bus_property_get_bool, offsetof(Manager, confirm_spawn), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ShowStatus", "b", property_get_show_status, 0, 0), + SD_BUS_PROPERTY("UnitPath", "as", NULL, offsetof(Manager, lookup_paths.search_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultStandardOutput", "s", bus_property_get_exec_output, offsetof(Manager, defaults.std_output), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultStandardError", "s", bus_property_get_exec_output, offsetof(Manager, defaults.std_error), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WatchdogDevice", "s", property_get_watchdog_device, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WatchdogLastPingTimestamp", "t", property_get_watchdog_last_ping_realtime, 0, 0), + SD_BUS_PROPERTY("WatchdogLastPingTimestampMonotonic", "t", property_get_watchdog_last_ping_monotonic, 0, 0), + SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogUSec", "t", property_get_runtime_watchdog, property_set_runtime_watchdog, 0, 0), + SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogPreUSec", "t", property_get_pretimeout_watchdog, property_set_pretimeout_watchdog, 0, 0), + SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogPreGovernor", "s", property_get_pretimeout_watchdog_governor, property_set_pretimeout_watchdog_governor, 0, 0), + SD_BUS_WRITABLE_PROPERTY("RebootWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, 0), + /* The following item is an obsolete alias */ + SD_BUS_WRITABLE_PROPERTY("ShutdownWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_WRITABLE_PROPERTY("KExecWatchdogUSec", "t", property_get_kexec_watchdog, property_set_kexec_watchdog, 0, 0), + SD_BUS_WRITABLE_PROPERTY("ServiceWatchdogs", "b", bus_property_get_bool, bus_property_set_bool, offsetof(Manager, service_watchdogs), 0), + SD_BUS_PROPERTY("ControlGroup", "s", NULL, offsetof(Manager, cgroup_root), 0), + SD_BUS_PROPERTY("SystemState", "s", property_get_system_state, 0, 0), + SD_BUS_PROPERTY("ExitCode", "y", bus_property_get_unsigned, offsetof(Manager, return_value), 0), + SD_BUS_PROPERTY("DefaultTimerAccuracyUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.timer_accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTimeoutStartUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTimeoutStopUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTimeoutAbortUSec", "t", property_get_default_timeout_abort_usec, 0, 0), + SD_BUS_PROPERTY("DefaultDeviceTimeoutUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.device_timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultRestartUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.restart_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultStartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST), + /* The following two items are obsolete alias */ + SD_BUS_PROPERTY("DefaultStartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Manager, defaults.start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("DefaultStartLimitInterval", "t", bus_property_get_usec, offsetof(Manager, defaults.start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("DefaultStartLimitBurst", "u", bus_property_get_unsigned, offsetof(Manager, defaults.start_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultCPUAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.cpu_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultBlockIOAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.blockio_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultIOAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.io_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultIPAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.ip_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultMemoryAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.memory_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTasksAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.tasks_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCPU", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCPUSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitFSIZE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitDATA", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitDATASoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSTACK", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCORE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCORESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRSS", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRSSSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNOFILE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitAS", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitASSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNPROC", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitLOCKS", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNICE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNICESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTPRIO", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTasksMax", "t", bus_property_get_tasks_max, offsetof(Manager, defaults.tasks_max), 0), + SD_BUS_PROPERTY("DefaultMemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.memory_pressure_threshold_usec), 0), + SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.memory_pressure_watch), 0), + SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, defaults.oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CtrlAltDelBurstAction", "s", bus_property_get_emergency_action, offsetof(Manager, cad_burst_action), SD_BUS_VTABLE_PROPERTY_CONST), + + SD_BUS_METHOD_WITH_ARGS("GetUnit", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("o", unit), + method_get_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUnitByPID", + SD_BUS_ARGS("u", pid), + SD_BUS_RESULT("o", unit), + method_get_unit_by_pid, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUnitByInvocationID", + SD_BUS_ARGS("ay", invocation_id), + SD_BUS_RESULT("o", unit), + method_get_unit_by_invocation_id, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUnitByControlGroup", + SD_BUS_ARGS("s", cgroup), + SD_BUS_RESULT("o", unit), + method_get_unit_by_control_group, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUnitByPIDFD", + SD_BUS_ARGS("h", pidfd), + SD_BUS_RESULT("o", unit, "s", unit_id, "ay", invocation_id), + method_get_unit_by_pidfd, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("LoadUnit", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("o", unit), + method_load_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("StartUnit", + SD_BUS_ARGS("s", name, "s", mode), + SD_BUS_RESULT("o", job), + method_start_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("StartUnitWithFlags", + SD_BUS_ARGS("s", name, "s", mode, "t", flags), + SD_BUS_RESULT("o", job), + method_start_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("StartUnitReplace", + SD_BUS_ARGS("s", old_unit, "s", new_unit, "s", mode), + SD_BUS_RESULT("o", job), + method_start_unit_replace, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("StopUnit", + SD_BUS_ARGS("s", name, "s", mode), + SD_BUS_RESULT("o", job), + method_stop_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReloadUnit", + SD_BUS_ARGS("s", name, "s", mode), + SD_BUS_RESULT("o", job), + method_reload_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RestartUnit", + SD_BUS_ARGS("s", name, "s", mode), + SD_BUS_RESULT("o", job), + method_restart_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("TryRestartUnit", + SD_BUS_ARGS("s", name, "s", mode), + SD_BUS_RESULT("o", job), + method_try_restart_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReloadOrRestartUnit", + SD_BUS_ARGS("s", name, "s", mode), + SD_BUS_RESULT("o", job), + method_reload_or_restart_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReloadOrTryRestartUnit", + SD_BUS_ARGS("s", name, "s", mode), + SD_BUS_RESULT("o", job), + method_reload_or_try_restart_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("EnqueueUnitJob", + SD_BUS_ARGS("s", name, "s", job_type, "s", job_mode), + SD_BUS_RESULT("u", job_id, "o", job_path, "s", unit_id, "o", unit_path, "s", job_type, "a(uosos)", affected_jobs), + method_enqueue_unit_job, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("KillUnit", + SD_BUS_ARGS("s", name, "s", whom, "i", signal), + SD_BUS_NO_RESULT, + method_kill_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("QueueSignalUnit", + SD_BUS_ARGS("s", name, "s", whom, "i", signal, "i", value), + SD_BUS_NO_RESULT, + method_kill_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CleanUnit", + SD_BUS_ARGS("s", name, "as", mask), + SD_BUS_NO_RESULT, + method_clean_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("FreezeUnit", + SD_BUS_ARGS("s", name), + SD_BUS_NO_RESULT, + method_freeze_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ThawUnit", + SD_BUS_ARGS("s", name), + SD_BUS_NO_RESULT, + method_thaw_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ResetFailedUnit", + SD_BUS_ARGS("s", name), + SD_BUS_NO_RESULT, + method_reset_failed_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetUnitProperties", + SD_BUS_ARGS("s", name, "b", runtime, "a(sv)", properties), + SD_BUS_NO_RESULT, + method_set_unit_properties, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("BindMountUnit", + SD_BUS_ARGS("s", name, "s", source, "s", destination, "b", read_only, "b", mkdir), + SD_BUS_NO_RESULT, + method_bind_mount_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("MountImageUnit", + SD_BUS_ARGS("s", name, "s", source, "s", destination, "b", read_only, "b", mkdir, "a(ss)", options), + SD_BUS_NO_RESULT, + method_mount_image_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RefUnit", + SD_BUS_ARGS("s", name), + SD_BUS_NO_RESULT, + method_ref_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("UnrefUnit", + SD_BUS_ARGS("s", name), + SD_BUS_NO_RESULT, + method_unref_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("StartTransientUnit", + SD_BUS_ARGS("s", name, "s", mode, "a(sv)", properties, "a(sa(sv))", aux), + SD_BUS_RESULT("o", job), + method_start_transient_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUnitProcesses", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("a(sus)", processes), + method_get_unit_processes, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("AttachProcessesToUnit", + SD_BUS_ARGS("s", unit_name, "s", subcgroup, "au", pids), + SD_BUS_NO_RESULT, + method_attach_processes_to_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("AbandonScope", + SD_BUS_ARGS("s", name), + SD_BUS_NO_RESULT, + method_abandon_scope, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetJob", + SD_BUS_ARGS("u", id), + SD_BUS_RESULT("o", job), + method_get_job, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetJobAfter", + SD_BUS_ARGS("u", id), + SD_BUS_RESULT("a(usssoo)", jobs), + method_get_job_waiting, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetJobBefore", + SD_BUS_ARGS("u", id), + SD_BUS_RESULT("a(usssoo)", jobs), + method_get_job_waiting, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CancelJob", + SD_BUS_ARGS("u", id), + SD_BUS_NO_RESULT, + method_cancel_job, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ClearJobs", + NULL, + NULL, + method_clear_jobs, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ResetFailed", + NULL, + NULL, + method_reset_failed, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetShowStatus", + SD_BUS_ARGS("s", mode), + SD_BUS_NO_RESULT, + method_set_show_status, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListUnits", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(ssssssouso)", units), + method_list_units, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListUnitsFiltered", + SD_BUS_ARGS("as", states), + SD_BUS_RESULT("a(ssssssouso)", units), + method_list_units_filtered, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListUnitsByPatterns", + SD_BUS_ARGS("as", states, "as", patterns), + SD_BUS_RESULT("a(ssssssouso)", units), + method_list_units_by_patterns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListUnitsByNames", + SD_BUS_ARGS("as", names), + SD_BUS_RESULT("a(ssssssouso)", units), + method_list_units_by_names, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListJobs", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(usssoo)", jobs), + method_list_jobs, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Subscribe", + NULL, + NULL, + method_subscribe, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Unsubscribe", + NULL, + NULL, + method_unsubscribe, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Dump", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", output), + method_dump, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DumpUnitsMatchingPatterns", + SD_BUS_ARGS("as", patterns), + SD_BUS_RESULT("s", output), + method_dump_units_matching_patterns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DumpByFileDescriptor", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("h", fd), + method_dump_by_fd, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DumpUnitsMatchingPatternsByFileDescriptor", + SD_BUS_ARGS("as", patterns), + SD_BUS_RESULT("h", fd), + method_dump_units_matching_patterns_by_fd, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CreateSnapshot", + SD_BUS_ARGS("s", name, "b", cleanup), + SD_BUS_RESULT("o", unit), + method_refuse_snapshot, + SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN), + SD_BUS_METHOD_WITH_ARGS("RemoveSnapshot", + SD_BUS_ARGS("s", name), + SD_BUS_NO_RESULT, + method_refuse_snapshot, + SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN), + SD_BUS_METHOD("Reload", + NULL, + NULL, + method_reload, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Reexecute", + NULL, + NULL, + method_reexecute, + SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_METHOD_NO_REPLY), + SD_BUS_METHOD("Exit", + NULL, + NULL, + method_exit, + 0), + SD_BUS_METHOD("Reboot", + NULL, + NULL, + method_reboot, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD_WITH_ARGS("SoftReboot", + SD_BUS_ARGS("s", new_root), + SD_BUS_NO_RESULT, + method_soft_reboot, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("PowerOff", + NULL, + NULL, + method_poweroff, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("Halt", + NULL, + NULL, + method_halt, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("KExec", + NULL, + NULL, + method_kexec, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD_WITH_ARGS("SwitchRoot", + SD_BUS_ARGS("s", new_root, "s", init), + SD_BUS_NO_RESULT, + method_switch_root, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD_WITH_ARGS("SetEnvironment", + SD_BUS_ARGS("as", assignments), + SD_BUS_NO_RESULT, + method_set_environment, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("UnsetEnvironment", + SD_BUS_ARGS("as", names), + SD_BUS_NO_RESULT, + method_unset_environment, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("UnsetAndSetEnvironment", + SD_BUS_ARGS("as", names, "as", assignments), + SD_BUS_NO_RESULT, + method_unset_and_set_environment, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("EnqueueMarkedJobs", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("ao", jobs), + method_enqueue_marked_jobs, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListUnitFiles", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(ss)", unit_files), + method_list_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListUnitFilesByPatterns", + SD_BUS_ARGS("as", states, "as", patterns), + SD_BUS_RESULT("a(ss)", unit_files), + method_list_unit_files_by_patterns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUnitFileState", + SD_BUS_ARGS("s", file), + SD_BUS_RESULT("s", state), + method_get_unit_file_state, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("EnableUnitFiles", + SD_BUS_ARGS("as", files, "b", runtime, "b", force), + SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes), + method_enable_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DisableUnitFiles", + SD_BUS_ARGS("as", files, "b", runtime), + SD_BUS_RESULT("a(sss)", changes), + method_disable_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("EnableUnitFilesWithFlags", + SD_BUS_ARGS("as", files, "t", flags), + SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes), + method_enable_unit_files_with_flags, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DisableUnitFilesWithFlags", + SD_BUS_ARGS("as", files, "t", flags), + SD_BUS_RESULT("a(sss)", changes), + method_disable_unit_files_with_flags, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DisableUnitFilesWithFlagsAndInstallInfo", + SD_BUS_ARGS("as", files, "t", flags), + SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes), + method_disable_unit_files_with_flags_and_install_info, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReenableUnitFiles", + SD_BUS_ARGS("as", files, "b", runtime, "b", force), + SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes), + method_reenable_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("LinkUnitFiles", + SD_BUS_ARGS("as", files, "b", runtime, "b", force), + SD_BUS_RESULT("a(sss)", changes), + method_link_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("PresetUnitFiles", + SD_BUS_ARGS("as", files, "b", runtime, "b", force), + SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes), + method_preset_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("PresetUnitFilesWithMode", + SD_BUS_ARGS("as", files, "s", mode, "b", runtime, "b", force), + SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes), + method_preset_unit_files_with_mode, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("MaskUnitFiles", + SD_BUS_ARGS("as", files, "b", runtime, "b", force), + SD_BUS_RESULT("a(sss)", changes), + method_mask_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("UnmaskUnitFiles", + SD_BUS_ARGS("as", files, "b", runtime), + SD_BUS_RESULT("a(sss)", changes), + method_unmask_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RevertUnitFiles", + SD_BUS_ARGS("as", files), + SD_BUS_RESULT("a(sss)", changes), + method_revert_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDefaultTarget", + SD_BUS_ARGS("s", name, "b", force), + SD_BUS_RESULT("a(sss)", changes), + method_set_default_target, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetDefaultTarget", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", name), + method_get_default_target, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("PresetAllUnitFiles", + SD_BUS_ARGS("s", mode, "b", runtime, "b", force), + SD_BUS_RESULT("a(sss)", changes), + method_preset_all_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("AddDependencyUnitFiles", + SD_BUS_ARGS("as", files, "s", target, "s", type, "b", runtime, "b", force), + SD_BUS_RESULT("a(sss)", changes), + method_add_dependency_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUnitFileLinks", + SD_BUS_ARGS("s", name, "b", runtime), + SD_BUS_RESULT("as", links), + method_get_unit_file_links, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetExitCode", + SD_BUS_ARGS("y", number), + SD_BUS_NO_RESULT, + method_set_exit_code, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("LookupDynamicUserByName", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("u", uid), + method_lookup_dynamic_user_by_name, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("LookupDynamicUserByUID", + SD_BUS_ARGS("u", uid), + SD_BUS_RESULT("s", name), + method_lookup_dynamic_user_by_uid, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetDynamicUsers", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(us)", users), + method_get_dynamic_users, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DumpUnitFileDescriptorStore", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("a(suuutuusu)", entries), + method_dump_unit_descriptor_store, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_SIGNAL_WITH_ARGS("UnitNew", + SD_BUS_ARGS("s", id, "o", unit), + 0), + SD_BUS_SIGNAL_WITH_ARGS("UnitRemoved", + SD_BUS_ARGS("s", id, "o", unit), + 0), + SD_BUS_SIGNAL_WITH_ARGS("JobNew", + SD_BUS_ARGS("u", id, "o", job, "s", unit), + 0), + SD_BUS_SIGNAL_WITH_ARGS("JobRemoved", + SD_BUS_ARGS("u", id, "o", job, "s", unit, "s", result), + 0), + SD_BUS_SIGNAL_WITH_ARGS("StartupFinished", + SD_BUS_ARGS("t", firmware, "t", loader, "t", kernel, "t", initrd, "t", userspace, "t", total), + 0), + SD_BUS_SIGNAL("UnitFilesChanged", NULL, 0), + SD_BUS_SIGNAL_WITH_ARGS("Reloading", + SD_BUS_ARGS("b", active), + 0), + + SD_BUS_VTABLE_END +}; + +const sd_bus_vtable bus_manager_log_control_vtable[] = { + SD_BUS_VTABLE_START(0), + + /* We define a private version of this interface here, since we want slightly different + * implementations for the setters. We'll still use the generic getters however, and we share the + * setters with the implementations for the Manager interface above (which pre-dates the generic + * service API interface). */ + + SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0), + SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0), + SD_BUS_PROPERTY("SyslogIdentifier", "s", bus_property_get_syslog_identifier, 0, 0), + + SD_BUS_VTABLE_END, +}; + +static int send_finished(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL; + usec_t *times = ASSERT_PTR(userdata); + int r; + + assert(bus); + + r = sd_bus_message_new_signal(bus, + &message, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "StartupFinished"); + if (r < 0) + return r; + + r = sd_bus_message_append(message, "tttttt", times[0], times[1], times[2], times[3], times[4], times[5]); + if (r < 0) + return r; + + return sd_bus_send(bus, message, NULL); +} + +void bus_manager_send_finished( + Manager *m, + usec_t firmware_usec, + usec_t loader_usec, + usec_t kernel_usec, + usec_t initrd_usec, + usec_t userspace_usec, + usec_t total_usec) { + + int r; + + assert(m); + + r = bus_foreach_bus( + m, + NULL, + send_finished, + (usec_t[6]) { + firmware_usec, + loader_usec, + kernel_usec, + initrd_usec, + userspace_usec, + total_usec + }); + if (r < 0) + log_debug_errno(r, "Failed to send finished signal: %m"); +} + +static int send_reloading(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL; + int r; + + assert(bus); + + r = sd_bus_message_new_signal(bus, &message, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "Reloading"); + if (r < 0) + return r; + + r = sd_bus_message_append(message, "b", PTR_TO_INT(userdata)); + if (r < 0) + return r; + + return sd_bus_send(bus, message, NULL); +} + +void bus_manager_send_reloading(Manager *m, bool active) { + int r; + + assert(m); + + r = bus_foreach_bus(m, NULL, send_reloading, INT_TO_PTR(active)); + if (r < 0) + log_debug_errno(r, "Failed to send reloading signal: %m"); +} + +static int send_changed_signal(sd_bus *bus, void *userdata) { + assert(bus); + + return sd_bus_emit_properties_changed_strv(bus, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + NULL); +} + +void bus_manager_send_change_signal(Manager *m) { + int r; + + assert(m); + + r = bus_foreach_bus(m, NULL, send_changed_signal, NULL); + if (r < 0) + log_debug_errno(r, "Failed to send manager change signal: %m"); +} diff --git a/src/core/dbus-manager.h b/src/core/dbus-manager.h new file mode 100644 index 0000000..9b05080 --- /dev/null +++ b/src/core/dbus-manager.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus-vtable.h" + +#include "manager.h" + +extern const sd_bus_vtable bus_manager_vtable[]; +extern const sd_bus_vtable bus_manager_log_control_vtable[]; + +void bus_manager_send_finished(Manager *m, usec_t firmware_usec, usec_t loader_usec, usec_t kernel_usec, usec_t initrd_usec, usec_t userspace_usec, usec_t total_usec); +void bus_manager_send_reloading(Manager *m, bool active); +void bus_manager_send_change_signal(Manager *m); + +int verify_run_space_and_log(const char *message); + +int bus_property_get_oom_policy(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_emergency_action(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c new file mode 100644 index 0000000..7dbbdd0 --- /dev/null +++ b/src/core/dbus-mount.c @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-kill.h" +#include "dbus-mount.h" +#include "dbus-util.h" +#include "mount.h" +#include "string-util.h" +#include "unit.h" +#include "utf8.h" + +static int property_get_what( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *escaped = NULL; + Mount *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + escaped = mount_get_what_escaped(m); + if (!escaped) + return -ENOMEM; + + return sd_bus_message_append_basic(reply, 's', escaped); +} + +static int property_get_options( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *escaped = NULL; + Mount *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + escaped = mount_get_options_escaped(m); + if (!escaped) + return -ENOMEM; + + return sd_bus_message_append_basic(reply, 's', escaped); +} + +static BUS_DEFINE_PROPERTY_GET(property_get_type, "s", Mount, mount_get_fstype); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, mount_result, MountResult); + +const sd_bus_vtable bus_mount_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Mount, where), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("What", "s", property_get_what, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Options","s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Type", "s", property_get_type, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Mount, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Mount, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Mount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SloppyOptions", "b", bus_property_get_bool, offsetof(Mount, sloppy_options), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LazyUnmount", "b", bus_property_get_bool, offsetof(Mount, lazy_unmount), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ForceUnmount", "b", bus_property_get_bool, offsetof(Mount, force_unmount), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReadWriteOnly", "b", bus_property_get_bool, offsetof(Mount, read_write_only), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Mount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_VTABLE("ExecMount", offsetof(Mount, exec_command[MOUNT_EXEC_MOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_VTABLE("ExecUnmount", offsetof(Mount, exec_command[MOUNT_EXEC_UNMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_VTABLE("ExecRemount", offsetof(Mount, exec_command[MOUNT_EXEC_REMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_VTABLE_END +}; + +static int bus_mount_set_transient_property( + Mount *m, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(m); + + assert(m); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Where")) + return bus_set_transient_path(u, name, &m->where, message, flags, error); + + if (streq(name, "What")) + return bus_set_transient_string(u, name, &m->parameters_fragment.what, message, flags, error); + + if (streq(name, "Options")) + return bus_set_transient_string(u, name, &m->parameters_fragment.options, message, flags, error); + + if (streq(name, "Type")) + return bus_set_transient_string(u, name, &m->parameters_fragment.fstype, message, flags, error); + + if (streq(name, "TimeoutUSec")) + return bus_set_transient_usec_fix_0(u, name, &m->timeout_usec, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &m->directory_mode, message, flags, error); + + if (streq(name, "SloppyOptions")) + return bus_set_transient_bool(u, name, &m->sloppy_options, message, flags, error); + + if (streq(name, "LazyUnmount")) + return bus_set_transient_bool(u, name, &m->lazy_unmount, message, flags, error); + + if (streq(name, "ForceUnmount")) + return bus_set_transient_bool(u, name, &m->force_unmount, message, flags, error); + + if (streq(name, "ReadWriteOnly")) + return bus_set_transient_bool(u, name, &m->read_write_only, message, flags, error); + + return 0; +} + +int bus_mount_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Mount *m = MOUNT(u); + int r; + + assert(m); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &m->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->transient && u->load_state == UNIT_STUB) { + /* This is a transient unit, let's load a little more */ + + r = bus_mount_set_transient_property(m, name, message, flags, error); + if (r != 0) + return r; + + r = bus_exec_context_set_transient_property(u, &m->exec_context, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &m->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_mount_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-mount.h b/src/core/dbus-mount.h new file mode 100644 index 0000000..5a848d3 --- /dev/null +++ b/src/core/dbus-mount.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_mount_vtable[]; + +int bus_mount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_mount_commit_properties(Unit *u); diff --git a/src/core/dbus-path.c b/src/core/dbus-path.c new file mode 100644 index 0000000..8cb6a26 --- /dev/null +++ b/src/core/dbus-path.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "dbus-path.h" +#include "dbus-util.h" +#include "list.h" +#include "path.h" +#include "path-util.h" +#include "string-util.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, path_result, PathResult); + +static int property_get_paths( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Path *p = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(spec, k, p->specs) { + r = sd_bus_message_append(reply, "(ss)", path_type_to_string(k->type), k->path); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_path_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Paths", "a(ss)", property_get_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MakeDirectory", "b", bus_property_get_bool, offsetof(Path, make_directory), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Path, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Path, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Path, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Path, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static int bus_path_set_transient_property( + Path *p, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(p); + int r; + + assert(p); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "MakeDirectory")) + return bus_set_transient_bool(u, name, &p->make_directory, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &p->directory_mode, message, flags, error); + + if (streq(name, "Paths")) { + const char *type_name, *path; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &type_name, &path)) > 0) { + PathType t; + + t = path_type_from_string(type_name); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown path type: %s", type_name); + + if (isempty(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is empty", type_name); + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is not absolute: %s", type_name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *k = NULL; + + r = path_simplify_alloc(path, &k); + if (r < 0) + return r; + + PathSpec *s = new(PathSpec, 1); + if (!s) + return -ENOMEM; + + *s = (PathSpec) { + .unit = u, + .path = TAKE_PTR(k), + .type = t, + .inotify_fd = -EBADF, + }; + + LIST_PREPEND(spec, p->specs, s); + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", type_name, path); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + path_free_specs(p); + unit_write_settingf(u, flags, name, "PathExists="); + } + + return 1; + } + + if (streq(name, "TriggerLimitBurst")) + return bus_set_transient_unsigned(u, name, &p->trigger_limit.burst, message, flags, error); + + if (streq(name, "TriggerLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &p->trigger_limit.interval, message, flags, error); + + return 0; +} + +int bus_path_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags mode, + sd_bus_error *error) { + + Path *p = PATH(u); + + assert(p); + assert(name); + assert(message); + + if (u->transient && u->load_state == UNIT_STUB) + return bus_path_set_transient_property(p, name, message, mode, error); + + return 0; +} diff --git a/src/core/dbus-path.h b/src/core/dbus-path.h new file mode 100644 index 0000000..b5018b0 --- /dev/null +++ b/src/core/dbus-path.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_path_vtable[]; + +int bus_path_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c new file mode 100644 index 0000000..78196a1 --- /dev/null +++ b/src/core/dbus-scope.c @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-kill.h" +#include "dbus-manager.h" +#include "dbus-scope.h" +#include "dbus-unit.h" +#include "dbus-util.h" +#include "dbus.h" +#include "scope.h" +#include "selinux-access.h" +#include "unit.h" + +int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Scope *s = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_unit_access_check(UNIT(s), message, "stop", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(UNIT(s)->manager, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = scope_abandon(s); + if (r == -ESTALE) + return sd_bus_error_setf(error, BUS_ERROR_SCOPE_NOT_RUNNING, "Scope %s is not running, cannot abandon.", UNIT(s)->id); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, scope_result, ScopeResult); +static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string); + +const sd_bus_vtable bus_scope_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Controller", "s", NULL, offsetof(Scope, controller), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Scope, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Scope, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Scope, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeRandomizedExtraUSec", "t", bus_property_get_usec, offsetof(Scope, runtime_rand_extra_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Scope, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_SIGNAL("RequestStop", NULL, 0), + SD_BUS_METHOD("Abandon", NULL, NULL, bus_scope_method_abandon, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_VTABLE_END +}; + +static int bus_scope_set_transient_property( + Scope *s, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(s); + int r; + + assert(s); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "TimeoutStopUSec")) + return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error); + + if (streq(name, "RuntimeMaxUSec")) + return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error); + + if (streq(name, "RuntimeRandomizedExtraUSec")) + return bus_set_transient_usec(u, name, &s->runtime_rand_extra_usec, message, flags, error); + + if (streq(name, "OOMPolicy")) + return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error); + + if (streq(name, "PIDs")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "u"); + if (r < 0) + return r; + + for (;;) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + uint32_t upid; + pid_t pid; + + r = sd_bus_message_read(message, "u", &upid); + if (r < 0) + return r; + if (r == 0) + break; + + if (upid == 0) { + if (!creds) { + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + } + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } else + pid = (uid_t) upid; + + r = pidref_set_pid(&pidref, pid); + if (r < 0) + return r; + + r = unit_pid_attachable(u, &pidref, error); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_watch_pidref(u, &pidref, /* exclusive= */ false); + if (r < 0 && r != -EEXIST) + return r; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return n <= 0 ? -EINVAL : 1; + } + + if (streq(name, "PIDFDs")) { + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "h"); + if (r < 0) + return r; + + for (;;) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + int fd; + + r = sd_bus_message_read(message, "h", &fd); + if (r < 0) + return r; + if (r == 0) + break; + + r = pidref_set_pidfd(&pidref, fd); + if (r < 0) + return r; + + r = unit_pid_attachable(u, &pidref, error); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_watch_pidref(u, &pidref, /* exclusive= */ false); + if (r < 0 && r != -EEXIST) + return r; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return n <= 0 ? -EINVAL : 1; + } + + if (streq(name, "Controller")) { + const char *controller; + + /* We can't support direct connections with this, as direct connections know no service or unique name + * concept, but the Controller field stores exactly that. */ + if (sd_bus_message_get_bus(message) != u->manager->api_bus) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Sorry, Controller= logic only supported via the bus."); + + r = sd_bus_message_read(message, "s", &controller); + if (r < 0) + return r; + + if (!isempty(controller) && !sd_bus_service_name_is_valid(controller)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Controller '%s' is not a valid bus name.", controller); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = free_and_strdup(&s->controller, empty_to_null(controller)); + if (r < 0) + return r; + } + + return 1; + } + + return 0; +} + +int bus_scope_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Scope *s = SCOPE(u); + int r; + + assert(s); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->load_state == UNIT_STUB) { + /* While we are created we still accept PIDs */ + + r = bus_scope_set_transient_property(s, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error); + if (r != 0) + return r; + + if (streq(name, "User")) + return bus_set_transient_user_relaxed(u, name, &s->user, message, flags, error); + + if (streq(name, "Group")) + return bus_set_transient_user_relaxed(u, name, &s->group, message, flags, error); + } + + return 0; +} + +int bus_scope_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} + +int bus_scope_send_request_stop(Scope *s) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(s); + + if (!s->controller) + return 0; + + p = unit_dbus_path(UNIT(s)); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + UNIT(s)->manager->api_bus, + &m, + p, + "org.freedesktop.systemd1.Scope", + "RequestStop"); + if (r < 0) + return r; + + return sd_bus_send_to(UNIT(s)->manager->api_bus, m, s->controller, NULL); +} + +static int on_controller_gone(sd_bus_track *track, void *userdata) { + Scope *s = userdata; + + assert(track); + + if (s->controller) { + log_unit_debug(UNIT(s), "Controller %s disappeared from bus.", s->controller); + unit_add_to_dbus_queue(UNIT(s)); + s->controller = mfree(s->controller); + } + + s->controller_track = sd_bus_track_unref(s->controller_track); + + return 0; +} + +int bus_scope_track_controller(Scope *s) { + int r; + + assert(s); + + if (!s->controller || s->controller_track) + return 0; + + r = sd_bus_track_new(UNIT(s)->manager->api_bus, &s->controller_track, on_controller_gone, s); + if (r < 0) + return r; + + r = sd_bus_track_add_name(s->controller_track, s->controller); + if (r < 0) { + s->controller_track = sd_bus_track_unref(s->controller_track); + return r; + } + + return 0; +} diff --git a/src/core/dbus-scope.h b/src/core/dbus-scope.h new file mode 100644 index 0000000..8f1bc02 --- /dev/null +++ b/src/core/dbus-scope.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "scope.h" +#include "unit.h" + +extern const sd_bus_vtable bus_scope_vtable[]; + +int bus_scope_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); +int bus_scope_commit_properties(Unit *u); + +int bus_scope_send_request_stop(Scope *s); + +int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int bus_scope_track_controller(Scope *s); diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c new file mode 100644 index 0000000..cc478f4 --- /dev/null +++ b/src/core/dbus-service.c @@ -0,0 +1,791 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "async.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-kill.h" +#include "dbus-manager.h" +#include "dbus-service.h" +#include "dbus-util.h" +#include "execute.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "locale-util.h" +#include "missing_fcntl.h" +#include "mount-util.h" +#include "open-file.h" +#include "parse-util.h" +#include "path-util.h" +#include "selinux-access.h" +#include "service.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, service_type, ServiceType); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exit_type, service_exit_type, ServiceExitType); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, service_result, ServiceResult); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart, service_restart, ServiceRestart); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart_mode, service_restart_mode, ServiceRestartMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_emergency_action, emergency_action, EmergencyAction); +static BUS_DEFINE_PROPERTY_GET2(property_get_notify_access, "s", Service, service_get_notify_access, notify_access_to_string); +static BUS_DEFINE_PROPERTY_GET(property_get_restart_usec_next, "t", Service, service_restart_usec_next); +static BUS_DEFINE_PROPERTY_GET(property_get_timeout_abort_usec, "t", Service, service_timeout_abort_usec); +static BUS_DEFINE_PROPERTY_GET(property_get_watchdog_usec, "t", Service, service_get_watchdog_usec); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode); + +static int property_get_open_files( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + OpenFile **open_files = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sst)"); + if (r < 0) + return r; + + LIST_FOREACH(open_files, of, *open_files) { + r = sd_bus_message_append(reply, "(sst)", of->path, of->fdname, (uint64_t) of->flags); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_exit_status_set( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + const ExitStatusSet *status_set = ASSERT_PTR(userdata); + unsigned n; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'r', "aiai"); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "i"); + if (r < 0) + return r; + + BITMAP_FOREACH(n, &status_set->status) { + assert(n < 256); + + r = sd_bus_message_append_basic(reply, 'i', &n); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "i"); + if (r < 0) + return r; + + BITMAP_FOREACH(n, &status_set->signal) { + const char *str; + + str = signal_to_string(n); + if (!str) + continue; + + r = sd_bus_message_append_basic(reply, 'i', &n); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int bus_service_method_mount(sd_bus_message *message, void *userdata, sd_bus_error *error, bool is_image) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + const char *dest, *src, *propagate_directory; + int read_only, make_file_or_directory; + Unit *u = ASSERT_PTR(userdata); + ExecContext *c; + int r; + + assert(message); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Adding bind mounts at runtime is only supported for system managers."); + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ssbb", &src, &dest, &read_only, &make_file_or_directory); + if (r < 0) + return r; + + if (!path_is_absolute(src) || !path_is_normalized(src)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Source path must be absolute and normalized."); + + if (!is_image && isempty(dest)) + dest = src; + else if (!path_is_absolute(dest) || !path_is_normalized(dest)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path must be absolute and normalized."); + + if (is_image) { + r = bus_read_mount_options(message, error, &options, NULL, ""); + if (r < 0) + return r; + } + + r = bus_verify_manage_units_async_full( + u, + is_image ? "mount-image" : "bind-mount", + CAP_SYS_ADMIN, + N_("Authentication is required to mount on '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + if (u->type != UNIT_SERVICE) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not of type .service"); + + /* If it would be dropped at startup time, return an error. The context should always be available, but + * there's an assert in exec_needs_mount_namespace, so double-check just in case. */ + c = unit_get_exec_context(u); + if (!c) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot access unit execution context"); + if (path_startswith_strv(dest, c->inaccessible_paths)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s is not accessible to this unit", dest); + + /* Ensure that the unit was started in a private mount namespace */ + if (!exec_needs_mount_namespace(c, NULL, unit_get_exec_runtime(u))) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit not running in private mount namespace, cannot activate bind mount"); + + PidRef* unit_pid = unit_main_pid(u); + if (!pidref_is_set(unit_pid) || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not running"); + + propagate_directory = strjoina("/run/systemd/propagate/", u->id); + if (is_image) + r = mount_image_in_namespace( + unit_pid, + propagate_directory, + "/run/systemd/incoming/", + src, dest, + read_only, + make_file_or_directory, + options, + c->mount_image_policy ?: &image_policy_service); + else + r = bind_mount_in_namespace( + unit_pid, + propagate_directory, + "/run/systemd/incoming/", + src, dest, + read_only, + make_file_or_directory); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to mount %s on %s in unit's namespace: %m", src, dest); + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_service_method_mount(message, userdata, error, false); +} + +int bus_service_method_mount_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_service_method_mount(message, userdata, error, true); +} + +int bus_service_method_dump_file_descriptor_store(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Service *s = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_unit_access_check(UNIT(s), message, "status", error); + if (r < 0) + return r; + + if (s->n_fd_store_max == 0 && s->n_fd_store == 0) + return sd_bus_error_setf(error, BUS_ERROR_FILE_DESCRIPTOR_STORE_DISABLED, "File descriptor store not enabled for %s.", UNIT(s)->id); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(suuutuusu)"); + if (r < 0) + return r; + + LIST_FOREACH(fd_store, i, s->fd_store) { + _cleanup_free_ char *path = NULL; + struct stat st; + int flags; + + if (fstat(i->fd, &st) < 0) { + log_debug_errno(errno, "Failed to stat() file descriptor entry '%s', skipping.", strna(i->fdname)); + continue; + } + + flags = fcntl(i->fd, F_GETFL); + if (flags < 0) { + log_debug_errno(errno, "Failed to issue F_GETFL on file descriptor entry '%s', skipping.", strna(i->fdname)); + continue; + } + + /* glibc implies O_LARGEFILE everywhere on 64-bit off_t builds, but forgets to hide it away on + * F_GETFL, but provides no definition to check for that. Let's mask the flag away manually, + * to not confuse clients. */ + flags &= ~RAW_O_LARGEFILE; + + (void) fd_get_path(i->fd, &path); + + r = sd_bus_message_append( + reply, + "(suuutuusu)", + i->fdname, + (uint32_t) st.st_mode, + (uint32_t) major(st.st_dev), (uint32_t) minor(st.st_dev), + (uint64_t) st.st_ino, + (uint32_t) major(st.st_rdev), (uint32_t) minor(st.st_rdev), + path, + (uint32_t) flags); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +#if __SIZEOF_SIZE_T__ == 8 +static int property_get_size_as_uint32( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + size_t *value = ASSERT_PTR(userdata); + uint32_t sz = *value >= UINT32_MAX ? UINT32_MAX : (uint32_t) *value; + + /* Returns a size_t as a D-Bus "u" type, i.e. as 32-bit value, even if size_t is 64-bit. We'll saturate if it doesn't fit. */ + + return sd_bus_message_append_basic(reply, 'u', &sz); +} +#elif __SIZEOF_SIZE_T__ == 4 +#define property_get_size_as_uint32 ((sd_bus_property_get_t) NULL) +#else +#error "Unexpected size of size_t" +#endif + +const sd_bus_vtable bus_service_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Service, type), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ExitType", "s", property_get_exit_type, offsetof(Service, exit_type), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Restart", "s", property_get_restart, offsetof(Service, restart), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartMode", "s", property_get_restart_mode, offsetof(Service, restart_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PIDFile", "s", NULL, offsetof(Service, pid_file), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NotifyAccess", "s", property_get_notify_access, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("RestartUSec", "t", bus_property_get_usec, offsetof(Service, restart_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartSteps", "u", bus_property_get_unsigned, offsetof(Service, restart_steps), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartMaxDelayUSec", "t", bus_property_get_usec, offsetof(Service, restart_max_delay_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartUSecNext", "t", property_get_restart_usec_next, 0, 0), + SD_BUS_PROPERTY("TimeoutStartUSec", "t", bus_property_get_usec, offsetof(Service, timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Service, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutAbortUSec", "t", property_get_timeout_abort_usec, 0, 0), + SD_BUS_PROPERTY("TimeoutStartFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_start_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutStopFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_stop_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Service, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeRandomizedExtraUSec", "t", bus_property_get_usec, offsetof(Service, runtime_rand_extra_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WatchdogUSec", "t", property_get_watchdog_usec, 0, 0), + BUS_PROPERTY_DUAL_TIMESTAMP("WatchdogTimestamp", offsetof(Service, watchdog_timestamp), 0), + SD_BUS_PROPERTY("PermissionsStartOnly", "b", bus_property_get_bool, offsetof(Service, permissions_start_only), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* 😷 deprecated */ + SD_BUS_PROPERTY("RootDirectoryStartOnly", "b", bus_property_get_bool, offsetof(Service, root_directory_start_only), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemainAfterExit", "b", bus_property_get_bool, offsetof(Service, remain_after_exit), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("GuessMainPID", "b", bus_property_get_bool, offsetof(Service, guess_main_pid), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartPreventExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_prevent_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartForceExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_force_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SuccessExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, success_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MainPID", "u", bus_property_get_pid, offsetof(Service, main_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Service, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("BusName", "s", NULL, offsetof(Service, bus_name), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FileDescriptorStoreMax", "u", bus_property_get_unsigned, offsetof(Service, n_fd_store_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NFileDescriptorStore", "u", property_get_size_as_uint32, offsetof(Service, n_fd_store), 0), + SD_BUS_PROPERTY("FileDescriptorStorePreserve", "s", bus_property_get_exec_preserve_mode, offsetof(Service, fd_store_preserve_mode), 0), + SD_BUS_PROPERTY("StatusText", "s", NULL, offsetof(Service, status_text), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("StatusErrno", "i", bus_property_get_int, offsetof(Service, status_errno), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Service, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("ReloadResult", "s", property_get_result, offsetof(Service, reload_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CleanResult", "s", property_get_result, offsetof(Service, clean_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("USBFunctionDescriptors", "s", NULL, offsetof(Service, usb_function_descriptors), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("USBFunctionStrings", "s", NULL, offsetof(Service, usb_function_strings), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NRestarts", "u", bus_property_get_unsigned, offsetof(Service, n_restarts), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Service, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OpenFile", "a(sst)", property_get_open_files, offsetof(Service, open_files), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReloadSignal", "i", bus_property_get_int, offsetof(Service, reload_signal), SD_BUS_VTABLE_PROPERTY_CONST), + + BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecCondition", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecConditionEx", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPreEx", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStart", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartEx", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPostEx", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecReload", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecReloadEx", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStop", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopPostEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + + SD_BUS_METHOD_WITH_ARGS("BindMount", + SD_BUS_ARGS("s", source, "s", destination, "b", read_only, "b", mkdir), + SD_BUS_NO_RESULT, + bus_service_method_bind_mount, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_METHOD_WITH_ARGS("MountImage", + SD_BUS_ARGS("s", source, "s", destination, "b", read_only, "b", mkdir, "a(ss)", options), + SD_BUS_NO_RESULT, + bus_service_method_mount_image, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_METHOD_WITH_ARGS("DumpFileDescriptorStore", + SD_BUS_NO_ARGS, + SD_BUS_ARGS("a(suuutuusu)", entries), + bus_service_method_dump_file_descriptor_store, + SD_BUS_VTABLE_UNPRIVILEGED), + + /* The following four are obsolete, and thus marked hidden here. They moved into the Unit interface */ + SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StartLimitAction", "s", property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("FailureAction", "s", property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_VTABLE_END +}; + +static int bus_set_transient_exit_status( + Unit *u, + const char *name, + ExitStatusSet *status_set, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const int32_t *status, *signal; + size_t n_status, n_signal, i; + int r; + + r = sd_bus_message_enter_container(message, 'r', "aiai"); + if (r < 0) + return r; + + r = sd_bus_message_read_array(message, 'i', (const void **) &status, &n_status); + if (r < 0) + return r; + + r = sd_bus_message_read_array(message, 'i', (const void **) &signal, &n_signal); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + n_status /= sizeof(int32_t); + n_signal /= sizeof(int32_t); + + if (n_status == 0 && n_signal == 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) { + exit_status_set_free(status_set); + unit_write_settingf(u, flags, name, "%s=", name); + return 1; + } + + for (i = 0; i < n_status; i++) { + if (status[i] < 0 || status[i] > 255) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid status code in %s: %"PRIi32, name, status[i]); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = bitmap_set(&status_set->status, status[i]); + if (r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%"PRIi32, name, status[i]); + } + } + + for (i = 0; i < n_signal; i++) { + const char *str; + + str = signal_to_string((int) signal[i]); + if (!str) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid signal in %s: %"PRIi32, name, signal[i]); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = bitmap_set(&status_set->signal, signal[i]); + if (r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%s", name, str); + } + } + + return 1; +} + +static int bus_set_transient_std_fd( + Unit *u, + const char *name, + int *p, + bool *b, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int fd, r; + + assert(p); + assert(b); + + r = sd_bus_message_read(message, "h", &fd); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + int copy; + + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) + return -errno; + + asynchronous_close(*p); + *p = copy; + *b = true; + } + + return 1; +} +static BUS_DEFINE_SET_TRANSIENT_PARSE(notify_access, NotifyAccess, notify_access_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(service_type, ServiceType, service_type_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(service_exit_type, ServiceExitType, service_exit_type_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart, ServiceRestart, service_restart_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart_mode, ServiceRestartMode, service_restart_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string); +static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(bus_name, sd_bus_service_name_is_valid); +static BUS_DEFINE_SET_TRANSIENT_PARSE(timeout_failure_mode, ServiceTimeoutFailureMode, service_timeout_failure_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(reload_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); + +static int bus_service_set_transient_property( + Service *s, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(s); + ServiceExecCommand ci; + int r; + + assert(s); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "PermissionsStartOnly")) + return bus_set_transient_bool(u, name, &s->permissions_start_only, message, flags, error); + + if (streq(name, "RootDirectoryStartOnly")) + return bus_set_transient_bool(u, name, &s->root_directory_start_only, message, flags, error); + + if (streq(name, "RemainAfterExit")) + return bus_set_transient_bool(u, name, &s->remain_after_exit, message, flags, error); + + if (streq(name, "GuessMainPID")) + return bus_set_transient_bool(u, name, &s->guess_main_pid, message, flags, error); + + if (streq(name, "Type")) + return bus_set_transient_service_type(u, name, &s->type, message, flags, error); + + if (streq(name, "ExitType")) + return bus_set_transient_service_exit_type(u, name, &s->exit_type, message, flags, error); + + if (streq(name, "OOMPolicy")) + return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error); + + if (streq(name, "RestartUSec")) + return bus_set_transient_usec(u, name, &s->restart_usec, message, flags, error); + + if (streq(name, "RestartSteps")) + return bus_set_transient_unsigned(u, name, &s->restart_steps, message, flags, error); + + if (streq(name, "RestartMaxDelayUSec")) + return bus_set_transient_usec(u, name, &s->restart_max_delay_usec, message, flags, error); + + if (streq(name, "TimeoutStartUSec")) { + r = bus_set_transient_usec(u, name, &s->timeout_start_usec, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) + s->start_timeout_defined = true; + + return r; + } + + if (streq(name, "TimeoutStopUSec")) + return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error); + + if (streq(name, "TimeoutAbortUSec")) { + r = bus_set_transient_usec(u, name, &s->timeout_abort_usec, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) + s->timeout_abort_set = true; + return r; + } + + if (streq(name, "TimeoutStartFailureMode")) + return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_start_failure_mode, message, flags, error); + + if (streq(name, "TimeoutStopFailureMode")) + return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_stop_failure_mode, message, flags, error); + + if (streq(name, "RuntimeMaxUSec")) + return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error); + + if (streq(name, "RuntimeRandomizedExtraUSec")) + return bus_set_transient_usec(u, name, &s->runtime_rand_extra_usec, message, flags, error); + + if (streq(name, "WatchdogUSec")) + return bus_set_transient_usec(u, name, &s->watchdog_usec, message, flags, error); + + if (streq(name, "FileDescriptorStoreMax")) + return bus_set_transient_unsigned(u, name, &s->n_fd_store_max, message, flags, error); + + if (streq(name, "FileDescriptorStorePreserve")) + return bus_set_transient_exec_preserve_mode(u, name, &s->fd_store_preserve_mode, message, flags, error); + + if (streq(name, "NotifyAccess")) + return bus_set_transient_notify_access(u, name, &s->notify_access, message, flags, error); + + if (streq(name, "PIDFile")) { + _cleanup_free_ char *n = NULL; + const char *v, *e; + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + if (!isempty(v)) { + n = path_make_absolute(v, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + if (!n) + return -ENOMEM; + + path_simplify(n); + + if (!path_is_normalized(n)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "PIDFile= path '%s' is not valid", n); + + e = path_startswith(n, "/var/run/"); + if (e) { + char *z; + + z = path_join("/run", e); + if (!z) + return log_oom(); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) + log_unit_notice(u, "Transient unit's PIDFile= property references path below legacy directory /var/run, updating %s %s %s; please update client accordingly.", + n, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), z); + + free_and_replace(n, z); + } + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + free_and_replace(s->pid_file, n); + unit_write_settingf(u, flags, name, "%s=%s", name, strempty(s->pid_file)); + } + + return 1; + } + + if (streq(name, "USBFunctionDescriptors")) + return bus_set_transient_path(u, name, &s->usb_function_descriptors, message, flags, error); + + if (streq(name, "USBFunctionStrings")) + return bus_set_transient_path(u, name, &s->usb_function_strings, message, flags, error); + + if (streq(name, "BusName")) + return bus_set_transient_bus_name(u, name, &s->bus_name, message, flags, error); + + if (streq(name, "Restart")) + return bus_set_transient_service_restart(u, name, &s->restart, message, flags, error); + + if (streq(name, "RestartMode")) + return bus_set_transient_service_restart_mode(u, name, &s->restart_mode, message, flags, error); + + if (streq(name, "RestartPreventExitStatus")) + return bus_set_transient_exit_status(u, name, &s->restart_prevent_status, message, flags, error); + + if (streq(name, "RestartForceExitStatus")) + return bus_set_transient_exit_status(u, name, &s->restart_force_status, message, flags, error); + + if (streq(name, "SuccessExitStatus")) + return bus_set_transient_exit_status(u, name, &s->success_status, message, flags, error); + + ci = service_exec_command_from_string(name); + if (ci < 0) + ci = service_exec_ex_command_from_string(name); + if (ci >= 0) + return bus_set_transient_exec_command(u, name, &s->exec_command[ci], message, flags, error); + + if (streq(name, "StandardInputFileDescriptor")) + return bus_set_transient_std_fd(u, name, &s->stdin_fd, &s->exec_context.stdio_as_fds, message, flags, error); + + if (streq(name, "StandardOutputFileDescriptor")) + return bus_set_transient_std_fd(u, name, &s->stdout_fd, &s->exec_context.stdio_as_fds, message, flags, error); + + if (streq(name, "StandardErrorFileDescriptor")) + return bus_set_transient_std_fd(u, name, &s->stderr_fd, &s->exec_context.stdio_as_fds, message, flags, error); + + if (streq(name, "OpenFile")) { + const char *path, *fdname; + uint64_t offlags; + + r = sd_bus_message_enter_container(message, 'a', "(sst)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(sst)", &path, &fdname, &offlags)) > 0) { + _cleanup_(open_file_freep) OpenFile *of = NULL; + _cleanup_free_ char *ofs = NULL; + + of = new(OpenFile, 1); + if (!of) + return -ENOMEM; + + *of = (OpenFile) { + .path = strdup(path), + .fdname = strdup(fdname), + .flags = offlags, + }; + + if (!of->path || !of->fdname) + return -ENOMEM; + + r = open_file_validate(of); + if (r < 0) + return r; + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + continue; + + r = open_file_to_string(of, &ofs); + if (r < 0) + return sd_bus_error_set_errnof( + error, r, "Failed to convert OpenFile= value to string: %m"); + + LIST_APPEND(open_files, s->open_files, TAKE_PTR(of)); + unit_write_settingf(u, flags | UNIT_ESCAPE_SPECIFIERS, name, "OpenFile=%s", ofs); + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return 1; + } + + if (streq(name, "ReloadSignal")) + return bus_set_transient_reload_signal(u, name, &s->reload_signal, message, flags, error); + + return 0; +} + +int bus_service_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Service *s = SERVICE(u); + int r; + + assert(s); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->transient && u->load_state == UNIT_STUB) { + /* This is a transient unit, let's allow a little more */ + + r = bus_service_set_transient_property(s, name, message, flags, error); + if (r != 0) + return r; + + r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_service_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-service.h b/src/core/dbus-service.h new file mode 100644 index 0000000..aea6cf7 --- /dev/null +++ b/src/core/dbus-service.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_service_vtable[]; + +int bus_service_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); +int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_service_method_mount_image(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_service_commit_properties(Unit *u); +int bus_service_method_dump_file_descriptor_store(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/core/dbus-slice.c b/src/core/dbus-slice.c new file mode 100644 index 0000000..de41d65 --- /dev/null +++ b/src/core/dbus-slice.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dbus-cgroup.h" +#include "dbus-slice.h" +#include "slice.h" +#include "unit.h" + +const sd_bus_vtable bus_slice_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_VTABLE_END +}; + +int bus_slice_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Slice *s = SLICE(u); + + assert(name); + assert(u); + + return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); +} + +int bus_slice_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-slice.h b/src/core/dbus-slice.h new file mode 100644 index 0000000..eb71916 --- /dev/null +++ b/src/core/dbus-slice.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_slice_vtable[]; + +int bus_slice_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_slice_commit_properties(Unit *u); diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c new file mode 100644 index 0000000..e77e9e5 --- /dev/null +++ b/src/core/dbus-socket.c @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-kill.h" +#include "dbus-socket.h" +#include "dbus-util.h" +#include "fd-util.h" +#include "ip-protocol-list.h" +#include "parse-util.h" +#include "path-util.h" +#include "socket.h" +#include "socket-netlink.h" +#include "socket-util.h" +#include "string-util.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, socket_result, SocketResult); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_bind_ipv6_only, socket_address_bind_ipv6_only, SocketAddressBindIPv6Only); +static BUS_DEFINE_PROPERTY_GET(property_get_fdname, "s", Socket, socket_fdname); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timestamping, socket_timestamping, SocketTimestamping); + +static int property_get_listen( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Socket *s = SOCKET(userdata); + int r; + + assert(bus); + assert(reply); + assert(s); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(port, p, s->ports) { + _cleanup_free_ char *address = NULL; + + r = socket_port_to_address(p, &address); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "(ss)", socket_port_type_to_string(p), address); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_socket_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("BindIPv6Only", "s", property_get_bind_ipv6_only, offsetof(Socket, bind_ipv6_only), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Backlog", "u", bus_property_get_unsigned, offsetof(Socket, backlog), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Socket, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindToDevice", "s", NULL, offsetof(Socket, bind_to_device), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SocketUser", "s", NULL, offsetof(Socket, user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SocketGroup", "s", NULL, offsetof(Socket, group), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SocketMode", "u", bus_property_get_mode, offsetof(Socket, socket_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Socket, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Accept", "b", bus_property_get_bool, offsetof(Socket, accept), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FlushPending", "b", bus_property_get_bool, offsetof(Socket, flush_pending), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Writable", "b", bus_property_get_bool, offsetof(Socket, writable), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAlive", "b", bus_property_get_bool, offsetof(Socket, keep_alive), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAliveTimeUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_time), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAliveIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAliveProbes", "u", bus_property_get_unsigned, offsetof(Socket, keep_alive_cnt), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DeferAcceptUSec" , "t", bus_property_get_usec, offsetof(Socket, defer_accept), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NoDelay", "b", bus_property_get_bool, offsetof(Socket, no_delay), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Priority", "i", bus_property_get_int, offsetof(Socket, priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReceiveBuffer", "t", bus_property_get_size, offsetof(Socket, receive_buffer), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SendBuffer", "t", bus_property_get_size, offsetof(Socket, send_buffer), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IPTOS", "i", bus_property_get_int, offsetof(Socket, ip_tos), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IPTTL", "i", bus_property_get_int, offsetof(Socket, ip_ttl), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PipeSize", "t", bus_property_get_size, offsetof(Socket, pipe_size), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FreeBind", "b", bus_property_get_bool, offsetof(Socket, free_bind), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Transparent", "b", bus_property_get_bool, offsetof(Socket, transparent), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Broadcast", "b", bus_property_get_bool, offsetof(Socket, broadcast), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassCredentials", "b", bus_property_get_bool, offsetof(Socket, pass_cred), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassSecurity", "b", bus_property_get_bool, offsetof(Socket, pass_sec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassPacketInfo", "b", bus_property_get_bool, offsetof(Socket, pass_pktinfo), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Timestamping", "s", property_get_timestamping, offsetof(Socket, timestamping), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemoveOnStop", "b", bus_property_get_bool, offsetof(Socket, remove_on_stop), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Listen", "a(ss)", property_get_listen, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Symlinks", "as", NULL, offsetof(Socket, symlinks), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Mark", "i", bus_property_get_int, offsetof(Socket, mark), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MaxConnections", "u", bus_property_get_unsigned, offsetof(Socket, max_connections), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MaxConnectionsPerSource", "u", bus_property_get_unsigned, offsetof(Socket, max_connections_per_source), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MessageQueueMaxMessages", "x", bus_property_get_long, offsetof(Socket, mq_maxmsg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MessageQueueMessageSize", "x", bus_property_get_long, offsetof(Socket, mq_msgsize), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TCPCongestion", "s", NULL, offsetof(Socket, tcp_congestion), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReusePort", "b", bus_property_get_bool, offsetof(Socket, reuse_port), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackLabel", "s", NULL, offsetof(Socket, smack), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackLabelIPIn", "s", NULL, offsetof(Socket, smack_ip_in), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackLabelIPOut", "s", NULL, offsetof(Socket, smack_ip_out), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Socket, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Socket, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NConnections", "u", bus_property_get_unsigned, offsetof(Socket, n_connections), 0), + SD_BUS_PROPERTY("NAccepted", "u", bus_property_get_unsigned, offsetof(Socket, n_accepted), 0), + SD_BUS_PROPERTY("NRefused", "u", bus_property_get_unsigned, offsetof(Socket, n_refused), 0), + SD_BUS_PROPERTY("FileDescriptorName", "s", property_get_fdname, 0, 0), + SD_BUS_PROPERTY("SocketProtocol", "i", bus_property_get_int, offsetof(Socket, socket_protocol), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PollLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, poll_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PollLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, poll_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Socket, exec_command[SOCKET_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Socket, exec_command[SOCKET_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPre", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_VTABLE_END +}; + +static bool check_size_t_truncation(uint64_t t) { + return (size_t) t == t; +} + +static const char* socket_protocol_to_string(int32_t i) { + if (i == IPPROTO_IP) + return ""; + + if (!IN_SET(i, IPPROTO_UDPLITE, IPPROTO_SCTP)) + return NULL; + + return ip_protocol_to_name(i); +} + +static BUS_DEFINE_SET_TRANSIENT(int, "i", int32_t, int, "%" PRIi32); +static BUS_DEFINE_SET_TRANSIENT(message_queue, "x", int64_t, long, "%" PRIi64); +static BUS_DEFINE_SET_TRANSIENT_IS_VALID(size_t_check_truncation, "t", uint64_t, size_t, "%" PRIu64, check_size_t_truncation); +static BUS_DEFINE_SET_TRANSIENT_PARSE(bind_ipv6_only, SocketAddressBindIPv6Only, socket_address_bind_ipv6_only_or_bool_from_string); +static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(fdname, fdname_is_valid); +static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(ifname, ifname_valid); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(ip_tos, "i", int32_t, int, "%" PRIi32, ip_tos_to_string_alloc); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(socket_protocol, "i", int32_t, int, "%" PRIi32, socket_protocol_to_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(socket_timestamping, SocketTimestamping, socket_timestamping_from_string_harder); + +static int bus_socket_set_transient_property( + Socket *s, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + SocketExecCommand ci; + Unit *u = UNIT(s); + int r; + + assert(s); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Accept")) + return bus_set_transient_bool(u, name, &s->accept, message, flags, error); + + if (streq(name, "FlushPending")) + return bus_set_transient_bool(u, name, &s->flush_pending, message, flags, error); + + if (streq(name, "Writable")) + return bus_set_transient_bool(u, name, &s->writable, message, flags, error); + + if (streq(name, "KeepAlive")) + return bus_set_transient_bool(u, name, &s->keep_alive, message, flags, error); + + if (streq(name, "NoDelay")) + return bus_set_transient_bool(u, name, &s->no_delay, message, flags, error); + + if (streq(name, "FreeBind")) + return bus_set_transient_bool(u, name, &s->free_bind, message, flags, error); + + if (streq(name, "Transparent")) + return bus_set_transient_bool(u, name, &s->transparent, message, flags, error); + + if (streq(name, "Broadcast")) + return bus_set_transient_bool(u, name, &s->broadcast, message, flags, error); + + if (streq(name, "PassCredentials")) + return bus_set_transient_bool(u, name, &s->pass_cred, message, flags, error); + + if (streq(name, "PassSecurity")) + return bus_set_transient_bool(u, name, &s->pass_sec, message, flags, error); + + if (streq(name, "PassPacketInfo")) + return bus_set_transient_bool(u, name, &s->pass_pktinfo, message, flags, error); + + if (streq(name, "Timestamping")) + return bus_set_transient_socket_timestamping(u, name, &s->timestamping, message, flags, error); + + if (streq(name, "ReusePort")) + return bus_set_transient_bool(u, name, &s->reuse_port, message, flags, error); + + if (streq(name, "RemoveOnStop")) + return bus_set_transient_bool(u, name, &s->remove_on_stop, message, flags, error); + + if (streq(name, "SELinuxContextFromNet")) + return bus_set_transient_bool(u, name, &s->selinux_context_from_net, message, flags, error); + + if (streq(name, "Priority")) + return bus_set_transient_int(u, name, &s->priority, message, flags, error); + + if (streq(name, "IPTTL")) + return bus_set_transient_int(u, name, &s->ip_ttl, message, flags, error); + + if (streq(name, "Mark")) + return bus_set_transient_int(u, name, &s->mark, message, flags, error); + + if (streq(name, "Backlog")) + return bus_set_transient_unsigned(u, name, &s->backlog, message, flags, error); + + if (streq(name, "MaxConnections")) + return bus_set_transient_unsigned(u, name, &s->max_connections, message, flags, error); + + if (streq(name, "MaxConnectionsPerSource")) + return bus_set_transient_unsigned(u, name, &s->max_connections_per_source, message, flags, error); + + if (streq(name, "KeepAliveProbes")) + return bus_set_transient_unsigned(u, name, &s->keep_alive_cnt, message, flags, error); + + if (streq(name, "TriggerLimitBurst")) + return bus_set_transient_unsigned(u, name, &s->trigger_limit.burst, message, flags, error); + + if (streq(name, "PollLimitBurst")) + return bus_set_transient_unsigned(u, name, &s->poll_limit_burst, message, flags, error); + + if (streq(name, "SocketMode")) + return bus_set_transient_mode_t(u, name, &s->socket_mode, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &s->directory_mode, message, flags, error); + + if (streq(name, "MessageQueueMaxMessages")) + return bus_set_transient_message_queue(u, name, &s->mq_maxmsg, message, flags, error); + + if (streq(name, "MessageQueueMessageSize")) + return bus_set_transient_message_queue(u, name, &s->mq_msgsize, message, flags, error); + + if (streq(name, "TimeoutUSec")) + return bus_set_transient_usec_fix_0(u, name, &s->timeout_usec, message, flags, error); + + if (streq(name, "KeepAliveTimeUSec")) + return bus_set_transient_usec(u, name, &s->keep_alive_time, message, flags, error); + + if (streq(name, "KeepAliveIntervalUSec")) + return bus_set_transient_usec(u, name, &s->keep_alive_interval, message, flags, error); + + if (streq(name, "DeferAcceptUSec")) + return bus_set_transient_usec(u, name, &s->defer_accept, message, flags, error); + + if (streq(name, "TriggerLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &s->trigger_limit.interval, message, flags, error); + + if (streq(name, "PollLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &s->poll_limit_interval, message, flags, error); + + if (streq(name, "SmackLabel")) + return bus_set_transient_string(u, name, &s->smack, message, flags, error); + + if (streq(name, "SmackLabelIPin")) + return bus_set_transient_string(u, name, &s->smack_ip_in, message, flags, error); + + if (streq(name, "SmackLabelIPOut")) + return bus_set_transient_string(u, name, &s->smack_ip_out, message, flags, error); + + if (streq(name, "TCPCongestion")) + return bus_set_transient_string(u, name, &s->tcp_congestion, message, flags, error); + + if (streq(name, "FileDescriptorName")) + return bus_set_transient_fdname(u, name, &s->fdname, message, flags, error); + + if (streq(name, "SocketUser")) + return bus_set_transient_user_relaxed(u, name, &s->user, message, flags, error); + + if (streq(name, "SocketGroup")) + return bus_set_transient_user_relaxed(u, name, &s->group, message, flags, error); + + if (streq(name, "BindIPv6Only")) + return bus_set_transient_bind_ipv6_only(u, name, &s->bind_ipv6_only, message, flags, error); + + if (streq(name, "ReceiveBuffer")) + return bus_set_transient_size_t_check_truncation(u, name, &s->receive_buffer, message, flags, error); + + if (streq(name, "SendBuffer")) + return bus_set_transient_size_t_check_truncation(u, name, &s->send_buffer, message, flags, error); + + if (streq(name, "PipeSize")) + return bus_set_transient_size_t_check_truncation(u, name, &s->pipe_size, message, flags, error); + + if (streq(name, "BindToDevice")) + return bus_set_transient_ifname(u, name, &s->bind_to_device, message, flags, error); + + if (streq(name, "IPTOS")) + return bus_set_transient_ip_tos(u, name, &s->ip_tos, message, flags, error); + + if (streq(name, "SocketProtocol")) + return bus_set_transient_socket_protocol(u, name, &s->socket_protocol, message, flags, error); + + ci = socket_exec_command_from_string(name); + if (ci >= 0) + return bus_set_transient_exec_command(u, name, + &s->exec_command[ci], + message, flags, error); + + if (streq(name, "Symlinks")) { + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) + if (!path_is_absolute(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Symlink path is not absolute: %s", *p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + s->symlinks = strv_free(s->symlinks); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&s->symlinks, l, true); + if (r < 0) + return -ENOMEM; + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (streq(name, "Listen")) { + const char *t, *a; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &t, &a)) > 0) { + _cleanup_(socket_port_freep) SocketPort *p = NULL; + + p = new(SocketPort, 1); + if (!p) + return log_oom(); + + *p = (SocketPort) { + .fd = -EBADF, + .socket = s, + }; + + p->type = socket_port_type_from_string(t); + if (p->type < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown Socket type: %s", t); + + if (p->type != SOCKET_SOCKET) { + if (!path_is_absolute(a) || !path_is_valid(a)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid socket path: %s", a); + + r = path_simplify_alloc(a, &p->path); + if (r < 0) + return r; + + } else if (streq(t, "Netlink")) { + r = socket_address_parse_netlink(&p->address, a); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid netlink address: %s", a); + + } else { + r = socket_address_parse(&p->address, a); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address: %s", a); + + p->address.type = socket_address_type_from_string(t); + if (p->address.type < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address type: %s", t); + + if (socket_address_family(&p->address) != AF_UNIX && p->address.type == SOCK_SEQPACKET) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Address family not supported: %s", a); + } + + empty = false; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + LIST_APPEND(port, s->ports, TAKE_PTR(p)); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Listen%s=%s", t, a); + } + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + socket_free_ports(s); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ListenStream="); + } + + return 1; + } + + return 0; +} + +int bus_socket_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Socket *s = SOCKET(u); + int r; + + assert(s); + assert(name); + assert(message); + + assert(s); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->transient && u->load_state == UNIT_STUB) { + /* This is a transient unit, let's load a little more */ + + r = bus_socket_set_transient_property(s, name, message, flags, error); + if (r != 0) + return r; + + r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_socket_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-socket.h b/src/core/dbus-socket.h new file mode 100644 index 0000000..f9f36a2 --- /dev/null +++ b/src/core/dbus-socket.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_socket_vtable[]; + +int bus_socket_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_socket_commit_properties(Unit *u); diff --git a/src/core/dbus-swap.c b/src/core/dbus-swap.c new file mode 100644 index 0000000..7230352 --- /dev/null +++ b/src/core/dbus-swap.c @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 Maarten Lankhorst +***/ + +#include "bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-swap.h" +#include "string-util.h" +#include "swap.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET(property_get_priority, "i", Swap, swap_get_priority); +static BUS_DEFINE_PROPERTY_GET(property_get_options, "s", Swap, swap_get_options); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, swap_result, SwapResult); + +const sd_bus_vtable bus_swap_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("What", "s", NULL, offsetof(Swap, what), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Priority", "i", property_get_priority, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Options", "s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Swap, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Swap, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Swap, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_VTABLE("ExecActivate", offsetof(Swap, exec_command[SWAP_EXEC_ACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_VTABLE("ExecDeactivate", offsetof(Swap, exec_command[SWAP_EXEC_DEACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_VTABLE_END +}; + +int bus_swap_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Swap *s = SWAP(u); + + assert(s); + assert(name); + assert(message); + + return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); +} + +int bus_swap_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-swap.h b/src/core/dbus-swap.h new file mode 100644 index 0000000..9d651b5 --- /dev/null +++ b/src/core/dbus-swap.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2010 Maarten Lankhorst +***/ + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_swap_vtable[]; + +int bus_swap_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_swap_commit_properties(Unit *u); diff --git a/src/core/dbus-target.c b/src/core/dbus-target.c new file mode 100644 index 0000000..e979fb7 --- /dev/null +++ b/src/core/dbus-target.c @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dbus-target.h" +#include "unit.h" + +const sd_bus_vtable bus_target_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_VTABLE_END +}; diff --git a/src/core/dbus-target.h b/src/core/dbus-target.h new file mode 100644 index 0000000..fedd4a9 --- /dev/null +++ b/src/core/dbus-target.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus-vtable.h" + +extern const sd_bus_vtable bus_target_vtable[]; diff --git a/src/core/dbus-timer.c b/src/core/dbus-timer.c new file mode 100644 index 0000000..4f78a52 --- /dev/null +++ b/src/core/dbus-timer.c @@ -0,0 +1,364 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "dbus-timer.h" +#include "dbus-util.h" +#include "strv.h" +#include "timer.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, timer_result, TimerResult); + +static int property_get_monotonic_timers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Timer *t = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(stt)"); + if (r < 0) + return r; + + LIST_FOREACH(value, v, t->values) { + _cleanup_free_ char *usec = NULL; + + if (v->base == TIMER_CALENDAR) + continue; + + usec = timer_base_to_usec_string(v->base); + if (!usec) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(stt)", usec, v->value, v->next_elapse); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_calendar_timers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Timer *t = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sst)"); + if (r < 0) + return r; + + LIST_FOREACH(value, v, t->values) { + _cleanup_free_ char *buf = NULL; + + if (v->base != TIMER_CALENDAR) + continue; + + r = calendar_spec_to_string(v->calendar_spec, &buf); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "(sst)", timer_base_to_string(v->base), buf, v->next_elapse); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_next_elapse_monotonic( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Timer *t = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "t", timer_next_elapse_monotonic(t)); +} + +const sd_bus_vtable bus_timer_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimersMonotonic", "a(stt)", property_get_monotonic_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("TimersCalendar", "a(sst)", property_get_calendar_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("OnClockChange", "b", bus_property_get_bool, offsetof(Timer, on_clock_change), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnTimezoneChange", "b", bus_property_get_bool, offsetof(Timer, on_timezone_change), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NextElapseUSecRealtime", "t", bus_property_get_usec, offsetof(Timer, next_elapse_realtime), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NextElapseUSecMonotonic", "t", property_get_next_elapse_monotonic, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("LastTriggerUSec", offsetof(Timer, last_trigger), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Timer, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("AccuracyUSec", "t", bus_property_get_usec, offsetof(Timer, accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RandomizedDelayUSec", "t", bus_property_get_usec, offsetof(Timer, random_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FixedRandomDelay", "b", bus_property_get_bool, offsetof(Timer, fixed_random_delay), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Persistent", "b", bus_property_get_bool, offsetof(Timer, persistent), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WakeSystem", "b", bus_property_get_bool, offsetof(Timer, wake_system), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemainAfterElapse", "b", bus_property_get_bool, offsetof(Timer, remain_after_elapse), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static int timer_add_one_monotonic_spec( + Timer *t, + const char *name, + TimerBase base, + UnitWriteFlags flags, + usec_t usec, + sd_bus_error *error) { + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + TimerValue *v; + + unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s", + timer_base_to_string(base), + FORMAT_TIMESPAN(usec, USEC_PER_MSEC)); + + v = new(TimerValue, 1); + if (!v) + return -ENOMEM; + + *v = (TimerValue) { + .base = base, + .value = usec, + }; + + LIST_PREPEND(value, t->values, v); + } + + return 1; +} + +static int timer_add_one_calendar_spec( + Timer *t, + const char *name, + TimerBase base, + UnitWriteFlags flags, + const char *str, + sd_bus_error *error) { + + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + int r; + + r = calendar_spec_from_string(str, &c); + if (r == -EINVAL) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid calendar spec"); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s", timer_base_to_string(base), str); + + TimerValue *v = new(TimerValue, 1); + if (!v) + return -ENOMEM; + + *v = (TimerValue) { + .base = base, + .calendar_spec = TAKE_PTR(c), + }; + + LIST_PREPEND(value, t->values, v); + } + + return 1; +}; + +static int bus_timer_set_transient_property( + Timer *t, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(t); + int r; + + assert(t); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "AccuracyUSec")) + return bus_set_transient_usec(u, name, &t->accuracy_usec, message, flags, error); + + if (streq(name, "AccuracySec")) { + log_notice("Client is using obsolete AccuracySec= transient property, please use AccuracyUSec= instead."); + return bus_set_transient_usec(u, "AccuracyUSec", &t->accuracy_usec, message, flags, error); + } + + if (streq(name, "RandomizedDelayUSec")) + return bus_set_transient_usec(u, name, &t->random_usec, message, flags, error); + + if (streq(name, "FixedRandomDelay")) + return bus_set_transient_bool(u, name, &t->fixed_random_delay, message, flags, error); + + if (streq(name, "WakeSystem")) + return bus_set_transient_bool(u, name, &t->wake_system, message, flags, error); + + if (streq(name, "Persistent")) + return bus_set_transient_bool(u, name, &t->persistent, message, flags, error); + + if (streq(name, "RemainAfterElapse")) + return bus_set_transient_bool(u, name, &t->remain_after_elapse, message, flags, error); + + if (streq(name, "OnTimezoneChange")) + return bus_set_transient_bool(u, name, &t->on_timezone_change, message, flags, error); + + if (streq(name, "OnClockChange")) + return bus_set_transient_bool(u, name, &t->on_clock_change, message, flags, error); + + if (streq(name, "TimersMonotonic")) { + const char *base_name; + usec_t usec; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &base_name, &usec)) > 0) { + TimerBase b; + + b = timer_base_from_string(base_name); + if (b < 0 || b == TIMER_CALENDAR) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid timer base: %s", base_name); + + r = timer_add_one_monotonic_spec(t, name, b, flags, usec, error); + if (r < 0) + return r; + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + timer_free_values(t); + unit_write_setting(u, flags, name, "OnActiveSec="); + } + + return 1; + + } else if (streq(name, "TimersCalendar")) { + const char *base_name, *str; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &base_name, &str)) > 0) { + TimerBase b; + + b = timer_base_from_string(base_name); + if (b != TIMER_CALENDAR) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid timer base: %s", base_name); + + r = timer_add_one_calendar_spec(t, name, b, flags, str, error); + if (r < 0) + return r; + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + timer_free_values(t); + unit_write_setting(u, flags, name, "OnCalendar="); + } + + return 1; + + } else if (STR_IN_SET(name, + "OnActiveSec", + "OnBootSec", + "OnStartupSec", + "OnUnitActiveSec", + "OnUnitInactiveSec")) { + + TimerBase b; + usec_t usec; + + log_notice("Client is using obsolete %s= transient property, please use TimersMonotonic= instead.", name); + + b = timer_base_from_string(name); + if (b < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown timer base %s", name); + + r = sd_bus_message_read(message, "t", &usec); + if (r < 0) + return r; + + return timer_add_one_monotonic_spec(t, name, b, flags, usec, error); + + } else if (streq(name, "OnCalendar")) { + + const char *str; + + log_notice("Client is using obsolete %s= transient property, please use TimersCalendar= instead.", name); + + r = sd_bus_message_read(message, "s", &str); + if (r < 0) + return r; + + return timer_add_one_calendar_spec(t, name, TIMER_CALENDAR, flags, str, error); + } + + return 0; +} + +int bus_timer_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags mode, + sd_bus_error *error) { + + Timer *t = TIMER(u); + + assert(t); + assert(name); + assert(message); + + if (u->transient && u->load_state == UNIT_STUB) + return bus_timer_set_transient_property(t, name, message, mode, error); + + return 0; +} diff --git a/src/core/dbus-timer.h b/src/core/dbus-timer.h new file mode 100644 index 0000000..ac436f1 --- /dev/null +++ b/src/core/dbus-timer.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_timer_vtable[]; + +int bus_timer_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c new file mode 100644 index 0000000..1a037b7 --- /dev/null +++ b/src/core/dbus-unit.c @@ -0,0 +1,2629 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-polkit.h" +#include "cgroup-util.h" +#include "condition.h" +#include "dbus-job.h" +#include "dbus-manager.h" +#include "dbus-unit.h" +#include "dbus-util.h" +#include "dbus.h" +#include "fd-util.h" +#include "install.h" +#include "locale-util.h" +#include "log.h" +#include "path-util.h" +#include "process-util.h" +#include "selinux-access.h" +#include "service.h" +#include "signal-util.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" +#include "web-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_collect_mode, collect_mode, CollectMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_load_state, unit_load_state, UnitLoadState); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_job_mode, job_mode, JobMode); +static BUS_DEFINE_PROPERTY_GET(property_get_description, "s", Unit, unit_description); +static BUS_DEFINE_PROPERTY_GET2(property_get_active_state, "s", Unit, unit_active_state, unit_active_state_to_string); +static BUS_DEFINE_PROPERTY_GET2(property_get_freezer_state, "s", Unit, unit_freezer_state, freezer_state_to_string); +static BUS_DEFINE_PROPERTY_GET(property_get_sub_state, "s", Unit, unit_sub_state_to_string); +static BUS_DEFINE_PROPERTY_GET2(property_get_unit_file_state, "s", Unit, unit_get_unit_file_state, unit_file_state_to_string); +static BUS_DEFINE_PROPERTY_GET(property_get_can_reload, "b", Unit, unit_can_reload); +static BUS_DEFINE_PROPERTY_GET(property_get_can_start, "b", Unit, unit_can_start_refuse_manual); +static BUS_DEFINE_PROPERTY_GET(property_get_can_stop, "b", Unit, unit_can_stop_refuse_manual); +static BUS_DEFINE_PROPERTY_GET(property_get_can_isolate, "b", Unit, unit_can_isolate_refuse_manual); +static BUS_DEFINE_PROPERTY_GET(property_get_can_freeze, "b", Unit, unit_can_freeze); +static BUS_DEFINE_PROPERTY_GET(property_get_need_daemon_reload, "b", Unit, unit_need_daemon_reload); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_strv, "as", 0); + +static int property_get_can_clean( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + ExecCleanMask mask; + int r; + + assert(bus); + assert(reply); + + r = unit_can_clean(u, &mask); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!FLAGS_SET(mask, 1U << t)) + continue; + + r = sd_bus_message_append(reply, "s", exec_resource_type_to_string(t)); + if (r < 0) + return r; + } + + if (FLAGS_SET(mask, EXEC_CLEAN_FDSTORE)) { + r = sd_bus_message_append(reply, "s", "fdstore"); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_names( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = ASSERT_PTR(userdata); + const char *t; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", u->id); + if (r < 0) + return r; + + SET_FOREACH(t, u->aliases) { + r = sd_bus_message_append(reply, "s", t); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_following( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata, *f; + + assert(bus); + assert(reply); + assert(u); + + f = unit_following(u); + return sd_bus_message_append(reply, "s", f ? f->id : NULL); +} + +static int property_get_dependencies( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata, *other; + UnitDependency d; + Hashmap *deps; + void *v; + int r; + + assert(bus); + assert(reply); + assert(u); + + d = unit_dependency_from_string(property); + assert_se(d >= 0); + + deps = unit_get_dependencies(u, d); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(v, other, deps) { + r = sd_bus_message_append(reply, "s", other->id); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_requires_mounts_for( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Hashmap **h = ASSERT_PTR(userdata); + const char *p; + void *v; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(v, p, *h) { + r = sd_bus_message_append(reply, "s", p); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_unit_file_preset( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = unit_get_unit_file_preset(u); + + return sd_bus_message_append(reply, "s", preset_action_past_tense_to_string(r)); +} + +static int property_get_job( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + Job **j = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + if (!*j) + return sd_bus_message_append(reply, "(uo)", 0, "/"); + + p = job_dbus_path(*j); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(uo)", (*j)->id, p); +} + +static int property_get_conditions( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + const char *(*to_string)(ConditionType type) = NULL; + Condition **list = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + to_string = streq(property, "Asserts") ? assert_type_to_string : condition_type_to_string; + + r = sd_bus_message_open_container(reply, 'a', "(sbbsi)"); + if (r < 0) + return r; + + LIST_FOREACH(conditions, c, *list) { + int tristate; + + tristate = + c->result == CONDITION_UNTESTED ? 0 : + c->result == CONDITION_SUCCEEDED ? 1 : -1; + + r = sd_bus_message_append(reply, "(sbbsi)", + to_string(c->type), + c->trigger, c->negate, + c->parameter, tristate); + if (r < 0) + return r; + + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_load_error( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = bus_unit_validate_load_state(u, &e); + if (r < 0) + return sd_bus_message_append(reply, "(ss)", e.name, e.message); + + return sd_bus_message_append(reply, "(ss)", NULL, NULL); +} + +static int property_get_markers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + unsigned *markers = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + /* Make sure out values fit in the bitfield. */ + assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8); + + for (UnitMarker m = 0; m < _UNIT_MARKER_MAX; m++) + if (FLAGS_SET(*markers, 1u << m)) { + r = sd_bus_message_append(reply, "s", unit_marker_to_string(m)); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static const char *const polkit_message_for_job[_JOB_TYPE_MAX] = { + [JOB_START] = N_("Authentication is required to start '$(unit)'."), + [JOB_STOP] = N_("Authentication is required to stop '$(unit)'."), + [JOB_RELOAD] = N_("Authentication is required to reload '$(unit)'."), + [JOB_RESTART] = N_("Authentication is required to restart '$(unit)'."), + [JOB_TRY_RESTART] = N_("Authentication is required to restart '$(unit)'."), +}; + +int bus_unit_method_start_generic( + sd_bus_message *message, + Unit *u, + JobType job_type, + bool reload_if_possible, + sd_bus_error *error) { + + BusUnitQueueFlags job_flags = reload_if_possible ? BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE : 0; + const char *smode, *verb; + JobMode mode; + int r; + + assert(message); + assert(u); + assert(job_type >= 0 && job_type < _JOB_TYPE_MAX); + + r = mac_selinux_unit_access_check( + u, message, + job_type_to_access_method(job_type), + error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &smode); + if (r < 0) + return r; + + mode = job_mode_from_string(smode); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode); + + if (reload_if_possible) + verb = strjoina("reload-or-", job_type_to_string(job_type)); + else + verb = job_type_to_string(job_type); + + if (sd_bus_message_is_method_call(message, NULL, "StartUnitWithFlags")) { + uint64_t input_flags = 0; + + r = sd_bus_message_read(message, "t", &input_flags); + if (r < 0) + return r; + /* Let clients know that this version doesn't support any flags at the moment. */ + if (input_flags != 0) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, + "Invalid 'flags' parameter '%" PRIu64 "'", + input_flags); + } + + r = bus_verify_manage_units_async_full( + u, + verb, + CAP_SYS_ADMIN, + polkit_message_for_job[job_type], + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + return bus_unit_queue_job(message, u, job_type, mode, job_flags, error); +} + +static int bus_unit_method_start(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_START, false, error); +} + +static int bus_unit_method_stop(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_STOP, false, error); +} + +static int bus_unit_method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_RELOAD, false, error); +} + +static int bus_unit_method_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_RESTART, false, error); +} + +static int bus_unit_method_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, false, error); +} + +static int bus_unit_method_reload_or_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_RESTART, true, error); +} + +static int bus_unit_method_reload_or_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, true, error); +} + +int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + BusUnitQueueFlags flags = BUS_UNIT_QUEUE_VERBOSE_REPLY; + const char *jtype, *smode; + Unit *u = ASSERT_PTR(userdata); + JobType type; + JobMode mode; + int r; + + assert(message); + + r = sd_bus_message_read(message, "ss", &jtype, &smode); + if (r < 0) + return r; + + /* Parse the two magic reload types "reload-or-…" manually */ + if (streq(jtype, "reload-or-restart")) { + type = JOB_RESTART; + flags |= BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE; + } else if (streq(jtype, "reload-or-try-restart")) { + type = JOB_TRY_RESTART; + flags |= BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE; + } else { + /* And the rest generically */ + type = job_type_from_string(jtype); + if (type < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job type %s invalid", jtype); + } + + mode = job_mode_from_string(smode); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode); + + r = mac_selinux_unit_access_check( + u, message, + job_type_to_access_method(type), + error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + jtype, + CAP_SYS_ADMIN, + polkit_message_for_job[type], + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + return bus_unit_queue_job(message, u, type, mode, flags, error); +} + +int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = ASSERT_PTR(userdata); + int32_t value = 0; + const char *swho; + int32_t signo; + KillWho who; + int r, code; + + assert(message); + + r = mac_selinux_unit_access_check(u, message, "stop", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "si", &swho, &signo); + if (r < 0) + return r; + + if (startswith(sd_bus_message_get_member(message), "QueueSignal")) { + r = sd_bus_message_read(message, "i", &value); + if (r < 0) + return r; + + code = SI_QUEUE; + } else + code = SI_USER; + + if (isempty(swho)) + who = KILL_ALL; + else { + who = kill_who_from_string(swho); + if (who < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid who argument: %s", swho); + } + + if (!SIGNAL_VALID(signo)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Signal number out of range."); + + if (code == SI_QUEUE && !((signo >= SIGRTMIN) && (signo <= SIGRTMAX))) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Value parameter only accepted for realtime signals (SIGRTMIN…SIGRTMAX), refusing for signal SIG%s.", signal_to_string(signo)); + + r = bus_verify_manage_units_async_full( + u, + "kill", + CAP_KILL, + N_("Authentication is required to send a UNIX signal to the processes of '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_kill(u, who, signo, code, value, error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_unit_access_check(u, message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "reset-failed", + CAP_SYS_ADMIN, + N_("Authentication is required to reset the \"failed\" state of '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + unit_reset_failed(u); + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = ASSERT_PTR(userdata); + int runtime, r; + + assert(message); + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &runtime); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "set-property", + CAP_SYS_ADMIN, + N_("Authentication is required to set properties on '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = bus_unit_set_properties(u, message, runtime ? UNIT_RUNTIME : UNIT_PERSISTENT, true, error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "ref", + CAP_SYS_ADMIN, + NULL, + false, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = bus_unit_track_add_sender(u, message); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = bus_unit_track_remove_sender(u, message); + if (r == -EUNATCH) + return sd_bus_error_set(error, BUS_ERROR_NOT_REFERENCED, "Unit has not been referenced yet."); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error) { + ExecCleanMask mask = 0; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = mac_selinux_unit_access_check(u, message, "stop", error); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + ExecCleanMask m; + const char *i; + + r = sd_bus_message_read(message, "s", &i); + if (r < 0) + return r; + if (r == 0) + break; + + m = exec_clean_mask_from_string(i); + if (m < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid resource type: %s", i); + + mask |= m; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "clean", + CAP_DAC_OVERRIDE, + N_("Authentication is required to delete files and directories associated with '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_clean(u, mask); + if (r == -EOPNOTSUPP) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support cleaning.", u->id); + if (r == -EUNATCH) + return sd_bus_error_set(error, BUS_ERROR_NOTHING_TO_CLEAN, "No matching resources found."); + if (r == -EBUSY) + return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit is not inactive or has pending job."); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userdata, sd_bus_error *error, FreezerAction action) { + const char* perm; + int (*method)(Unit*); + Unit *u = ASSERT_PTR(userdata); + bool reply_no_delay = false; + int r; + + assert(message); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); + + if (action == FREEZER_FREEZE) { + perm = "stop"; + method = unit_freeze; + } else { + perm = "start"; + method = unit_thaw; + } + + r = mac_selinux_unit_access_check(u, message, perm, error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + perm, + CAP_SYS_ADMIN, + N_("Authentication is required to freeze or thaw the processes of '$(unit)' unit."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = method(u); + if (r == -EOPNOTSUPP) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support freezing.", u->id); + if (r == -EBUSY) + return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit has a pending job."); + if (r == -EHOSTDOWN) + return sd_bus_error_set(error, BUS_ERROR_UNIT_INACTIVE, "Unit is inactive."); + if (r == -EALREADY) + return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Previously requested freezer operation for unit '%s' is still in progress.", u->id); + if (r < 0) + return r; + if (r == 0) + reply_no_delay = true; + + if (u->pending_freezer_invocation) { + bus_unit_send_pending_freezer_message(u, true); + assert(!u->pending_freezer_invocation); + } + + u->pending_freezer_invocation = sd_bus_message_ref(message); + + if (reply_no_delay) { + r = bus_unit_send_pending_freezer_message(u, false); + if (r < 0) + return r; + } + + return 1; +} + +int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_freezer_generic(message, userdata, error, FREEZER_THAW); +} + +int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_freezer_generic(message, userdata, error, FREEZER_FREEZE); +} + +static int property_get_refs( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (const char *i = sd_bus_track_first(u->bus_track); i; i = sd_bus_track_next(u->bus_track)) { + int c; + + c = sd_bus_track_count_name(u->bus_track, i); + if (c < 0) + return c; + + /* Add the item multiple times if the ref count for each is above 1 */ + for (int k = 0; k < c; k++) { + r = sd_bus_message_append(reply, "s", i); + if (r < 0) + return r; + } + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_unit_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Id", "s", NULL, offsetof(Unit, id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Names", "as", property_get_names, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Following", "s", property_get_following, 0, 0), + SD_BUS_PROPERTY("Requires", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Requisite", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Wants", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindsTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PartOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Upholds", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequiredBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequisiteOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WantedBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BoundBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UpheldBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConsistsOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Conflicts", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConflictedBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Before", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("After", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnSuccess", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnSuccessOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnFailure", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnFailureOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Triggers", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggeredBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PropagatesReloadTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReloadPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PropagatesStopTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StopPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JoinsNamespaceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SliceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_requires_mounts_for, offsetof(Unit, requires_mounts_for), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Documentation", "as", NULL, offsetof(Unit, documentation), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Description", "s", property_get_description, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AccessSELinuxContext", "s", NULL, offsetof(Unit, access_selinux_context), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LoadState", "s", property_get_load_state, offsetof(Unit, load_state), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ActiveState", "s", property_get_active_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("FreezerState", "s", property_get_freezer_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("SubState", "s", property_get_sub_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("FragmentPath", "s", NULL, offsetof(Unit, fragment_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SourcePath", "s", NULL, offsetof(Unit, source_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DropInPaths", "as", NULL, offsetof(Unit, dropin_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UnitFileState", "s", property_get_unit_file_state, 0, 0), + SD_BUS_PROPERTY("UnitFilePreset", "s", property_get_unit_file_preset, 0, 0), + BUS_PROPERTY_DUAL_TIMESTAMP("StateChangeTimestamp", offsetof(Unit, state_change_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("InactiveExitTimestamp", offsetof(Unit, inactive_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("ActiveEnterTimestamp", offsetof(Unit, active_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("ActiveExitTimestamp", offsetof(Unit, active_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("InactiveEnterTimestamp", offsetof(Unit, inactive_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CanStart", "b", property_get_can_start, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanStop", "b", property_get_can_stop, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanReload", "b", property_get_can_reload, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanIsolate", "b", property_get_can_isolate, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanClean", "as", property_get_can_clean, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanFreeze", "b", property_get_can_freeze, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Job", "(uo)", property_get_job, offsetof(Unit, job), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("StopWhenUnneeded", "b", bus_property_get_bool, offsetof(Unit, stop_when_unneeded), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RefuseManualStart", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_start), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SurviveFinalKillSignal", "b", bus_property_get_bool, offsetof(Unit, survive_final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnSuccesJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* deprecated */ + SD_BUS_PROPERTY("OnSuccessJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IgnoreOnIsolate", "b", bus_property_get_bool, offsetof(Unit, ignore_on_isolate), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NeedDaemonReload", "b", property_get_need_daemon_reload, 0, 0), + SD_BUS_PROPERTY("Markers", "as", property_get_markers, offsetof(Unit, markers), 0), + SD_BUS_PROPERTY("JobTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_timeout), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobRunningTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_running_timeout), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobTimeoutAction", "s", bus_property_get_emergency_action, offsetof(Unit, job_timeout_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobTimeoutRebootArgument", "s", NULL, offsetof(Unit, job_timeout_reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConditionResult", "b", bus_property_get_bool, offsetof(Unit, condition_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("AssertResult", "b", bus_property_get_bool, offsetof(Unit, assert_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("ConditionTimestamp", offsetof(Unit, condition_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("AssertTimestamp", offsetof(Unit, assert_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Conditions", "a(sbbsi)", property_get_conditions, offsetof(Unit, conditions), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("Asserts", "a(sbbsi)", property_get_conditions, offsetof(Unit, asserts), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("LoadError", "(ss)", property_get_load_error, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Transient", "b", bus_property_get_bool, offsetof(Unit, transient), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Perpetual", "b", bus_property_get_bool, offsetof(Unit, perpetual), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StartLimitAction", "s", bus_property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FailureAction", "s", bus_property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FailureActionExitStatus", "i", bus_property_get_int, offsetof(Unit, failure_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SuccessAction", "s", bus_property_get_emergency_action, offsetof(Unit, success_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SuccessActionExitStatus", "i", bus_property_get_int, offsetof(Unit, success_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("InvocationID", "ay", bus_property_get_id128, offsetof(Unit, invocation_id), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CollectMode", "s", property_get_collect_mode, offsetof(Unit, collect_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Refs", "as", property_get_refs, 0, 0), + SD_BUS_PROPERTY("ActivationDetails", "a(ss)", bus_property_get_activation_details, offsetof(Unit, activation_details), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + + SD_BUS_METHOD_WITH_ARGS("Start", + SD_BUS_ARGS("s", mode), + SD_BUS_RESULT("o", job), + bus_unit_method_start, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Stop", + SD_BUS_ARGS("s", mode), + SD_BUS_RESULT("o", job), + bus_unit_method_stop, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Reload", + SD_BUS_ARGS("s", mode), + SD_BUS_RESULT("o", job), + bus_unit_method_reload, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Restart", + SD_BUS_ARGS("s", mode), + SD_BUS_RESULT("o", job), + bus_unit_method_restart, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("TryRestart", + SD_BUS_ARGS("s", mode), + SD_BUS_RESULT("o", job), + bus_unit_method_try_restart, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReloadOrRestart", + SD_BUS_ARGS("s", mode), + SD_BUS_RESULT("o", job), + bus_unit_method_reload_or_restart, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReloadOrTryRestart", + SD_BUS_ARGS("s", mode), + SD_BUS_RESULT("o", job), + bus_unit_method_reload_or_try_restart, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("EnqueueJob", + SD_BUS_ARGS("s", job_type, "s", job_mode), + SD_BUS_RESULT("u", job_id, "o", job_path, "s", unit_id, "o", unit_path, "s", job_type, "a(uosos)", affected_jobs), + bus_unit_method_enqueue_job, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Kill", + SD_BUS_ARGS("s", whom, "i", signal), + SD_BUS_NO_RESULT, + bus_unit_method_kill, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("QueueSignal", + SD_BUS_ARGS("s", whom, "i", signal, "i", value), + SD_BUS_NO_RESULT, + bus_unit_method_kill, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ResetFailed", + NULL, + NULL, + bus_unit_method_reset_failed, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetProperties", + SD_BUS_ARGS("b", runtime, "a(sv)", properties), + SD_BUS_NO_RESULT, + bus_unit_method_set_properties, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Ref", + NULL, + NULL, + bus_unit_method_ref, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Unref", + NULL, + NULL, + bus_unit_method_unref, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Clean", + SD_BUS_ARGS("as", mask), + SD_BUS_NO_RESULT, + bus_unit_method_clean, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Freeze", + NULL, + NULL, + bus_unit_method_freeze, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Thaw", + NULL, + NULL, + bus_unit_method_thaw, + SD_BUS_VTABLE_UNPRIVILEGED), + + /* For dependency types we don't support anymore always return an empty array */ + SD_BUS_PROPERTY("RequiresOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RequisiteOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RequiredByOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RequisiteOfOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + /* Obsolete alias names */ + SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + + SD_BUS_VTABLE_END +}; + +static int property_get_slice( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "s", unit_slice_name(u)); +} + +static int property_get_current_memory( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t sz = UINT64_MAX; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = unit_get_memory_current(u, &sz); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get current memory usage from cgroup: %m"); + + return sd_bus_message_append(reply, "t", sz); +} + +static int property_get_available_memory( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t sz = UINT64_MAX; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = unit_get_memory_available(u, &sz); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get total available memory from cgroup: %m"); + + return sd_bus_message_append(reply, "t", sz); +} + +static int property_get_memory_accounting( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = ASSERT_PTR(userdata); + CGroupMemoryAccountingMetric metric; + uint64_t sz = UINT64_MAX; + + assert(bus); + assert(reply); + + assert_se((metric = cgroup_memory_accounting_metric_from_string(property)) >= 0); + (void) unit_get_memory_accounting(u, metric, &sz); + return sd_bus_message_append(reply, "t", sz); +} + +static int property_get_current_tasks( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t cn = UINT64_MAX; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = unit_get_tasks_current(u, &cn); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get pids.current attribute: %m"); + + return sd_bus_message_append(reply, "t", cn); +} + +static int property_get_cpu_usage( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + nsec_t ns = NSEC_INFINITY; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = unit_get_cpu_usage(u, &ns); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get cpuacct.usage attribute: %m"); + + return sd_bus_message_append(reply, "t", ns); +} + +static int property_get_cpuset_cpus( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = ASSERT_PTR(userdata); + _cleanup_(cpu_set_reset) CPUSet cpus = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + + (void) unit_get_cpuset(u, &cpus, "cpuset.cpus.effective"); + (void) cpu_set_to_dbus(&cpus, &array, &allocated); + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_cpuset_mems( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = ASSERT_PTR(userdata); + _cleanup_(cpu_set_reset) CPUSet mems = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + + (void) unit_get_cpuset(u, &mems, "cpuset.mems.effective"); + (void) cpu_set_to_dbus(&mems, &array, &allocated); + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_cgroup( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = ASSERT_PTR(userdata); + const char *t = NULL; + + assert(bus); + assert(reply); + + /* Three cases: a) u->cgroup_path is NULL, in which case the + * unit has no control group, which we report as the empty + * string. b) u->cgroup_path is the empty string, which + * indicates the root cgroup, which we report as "/". c) all + * other cases we report as-is. */ + + if (u->cgroup_path) + t = empty_to_root(u->cgroup_path); + + return sd_bus_message_append(reply, "s", t); +} + +static int append_process(sd_bus_message *reply, const char *p, PidRef *pid, Set *pids) { + _cleanup_free_ char *buf = NULL, *cmdline = NULL; + int r; + + assert(reply); + assert(pidref_is_set(pid)); + + r = set_put(pids, PID_TO_PTR(pid->pid)); + if (IN_SET(r, 0, -EEXIST)) + return 0; + if (r < 0) + return r; + + if (!p) { + r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &buf); + if (r == -ESRCH) + return 0; + if (r < 0) + return r; + + p = buf; + } + + (void) pidref_get_cmdline( + pid, + SIZE_MAX, + PROCESS_CMDLINE_COMM_FALLBACK | PROCESS_CMDLINE_QUOTE, + &cmdline); + + return sd_bus_message_append(reply, + "(sus)", + p, + (uint32_t) pid->pid, + cmdline); +} + +static int append_cgroup(sd_bus_message *reply, const char *p, Set *pids) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(reply); + assert(p); + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, p, &f); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + for (;;) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + + /* libvirt / qemu uses threaded mode and cgroup.procs cannot be read at the lower levels. + * From https://docs.kernel.org/admin-guide/cgroup-v2.html#threads, “cgroup.procs” in a + * threaded domain cgroup contains the PIDs of all processes in the subtree and is not + * readable in the subtree proper. */ + + r = cg_read_pidref(f, &pidref); + if (IN_SET(r, 0, -EOPNOTSUPP)) + break; + if (r < 0) + return r; + + r = pidref_is_kernel_thread(&pidref); + if (r == -ESRCH) /* gone by now */ + continue; + if (r < 0) + log_debug_errno(r, "Failed to determine if " PID_FMT " is a kernel thread, assuming not: %m", pidref.pid); + if (r > 0) + continue; + + r = append_process(reply, p, &pidref, pids); + if (r < 0) + return r; + } + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, p, &d); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *g = NULL, *j = NULL; + + r = cg_read_subgroup(d, &g); + if (r < 0) + return r; + if (r == 0) + break; + + j = path_join(empty_to_root(p), g); + if (!j) + return -ENOMEM; + + r = append_cgroup(reply, j, pids); + if (r < 0) + return r; + } + + return 0; +} + +int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_set_free_ Set *pids = NULL; + Unit *u = userdata; + int r; + + assert(message); + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + pids = set_new(NULL); + if (!pids) + return -ENOMEM; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(sus)"); + if (r < 0) + return r; + + if (u->cgroup_path) { + r = append_cgroup(reply, u->cgroup_path, pids); + if (r < 0) + return r; + } + + /* The main and control pids might live outside of the cgroup, hence fetch them separately */ + PidRef *pid = unit_main_pid(u); + if (pidref_is_set(pid)) { + r = append_process(reply, NULL, pid, pids); + if (r < 0) + return r; + } + + pid = unit_control_pid(u); + if (pidref_is_set(pid)) { + r = append_process(reply, NULL, pid, pids); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int property_get_ip_counter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t value = UINT64_MAX; + Unit *u = ASSERT_PTR(userdata); + CGroupIPAccountingMetric metric; + + assert(bus); + assert(reply); + assert(property); + + assert_se((metric = cgroup_ip_accounting_metric_from_string(property)) >= 0); + (void) unit_get_ip_accounting(u, metric, &value); + return sd_bus_message_append(reply, "t", value); +} + +static int property_get_io_counter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t value = UINT64_MAX; + Unit *u = ASSERT_PTR(userdata); + ssize_t metric; + + assert(bus); + assert(reply); + assert(property); + + assert_se((metric = cgroup_io_accounting_metric_from_string(property)) >= 0); + (void) unit_get_io_accounting(u, metric, /* allow_cache= */ false, &value); + return sd_bus_message_append(reply, "t", value); +} + +int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + _cleanup_set_free_ Set *pids = NULL; + Unit *u = userdata; + const char *path; + int r; + + assert(message); + + /* This migrates the processes with the specified PIDs into the cgroup of this unit, optionally below a + * specified cgroup path. Obviously this only works for units that actually maintain a cgroup + * representation. If a process is already in the cgroup no operation is executed – in this case the specified + * subcgroup path has no effect! */ + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &path); + if (r < 0) + return r; + + path = empty_to_null(path); + if (path) { + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path); + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path); + } + + if (!unit_cgroup_delegate(u)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Process migration not available on non-delegated units."); + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing."); + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, 'a', "u"); + if (r < 0) + return r; + for (;;) { + _cleanup_(pidref_freep) PidRef *pidref = NULL; + uid_t process_uid, sender_uid; + uint32_t upid; + pid_t pid; + + r = sd_bus_message_read(message, "u", &upid); + if (r < 0) + return r; + if (r == 0) + break; + + if (upid == 0) { + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } else + pid = (uid_t) upid; + + r = pidref_new_from_pid(pid, &pidref); + if (r < 0) + return r; + + /* Filter out duplicates */ + if (set_contains(pids, pidref)) + continue; + + /* Check if this process is suitable for attaching to this unit */ + r = unit_pid_attachable(u, pidref, error); + if (r < 0) + return r; + + /* Let's query the sender's UID, so that we can make our security decisions */ + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0) + return r; + + /* Let's validate security: if the sender is root, then all is OK. If the sender is any other unit, + * then the process' UID and the target unit's UID have to match the sender's UID */ + if (sender_uid != 0 && sender_uid != getuid()) { + r = pidref_get_uid(pidref, &process_uid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m"); + + if (process_uid != sender_uid) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid); + if (process_uid != u->ref_uid) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid); + } + + r = set_ensure_consume(&pids, &pidref_hash_ops_free, TAKE_PTR(pidref)); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = unit_attach_pids_to_cgroup(u, pids, path); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to attach processes to control group: %m"); + + return sd_bus_reply_method_return(message, NULL); +} + +const sd_bus_vtable bus_unit_cgroup_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0), + SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0), + SD_BUS_PROPERTY("ControlGroupId", "t", NULL, offsetof(Unit, cgroup_id), 0), + SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0), + SD_BUS_PROPERTY("MemoryPeak", "t", property_get_memory_accounting, 0, 0), + SD_BUS_PROPERTY("MemorySwapCurrent", "t", property_get_memory_accounting, 0, 0), + SD_BUS_PROPERTY("MemorySwapPeak", "t", property_get_memory_accounting, 0, 0), + SD_BUS_PROPERTY("MemoryZSwapCurrent", "t", property_get_memory_accounting, 0, 0), + SD_BUS_PROPERTY("MemoryAvailable", "t", property_get_available_memory, 0, 0), + SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0), + SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0), + SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0), + SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0), + SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPEgressPackets", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IOReadBytes", "t", property_get_io_counter, 0, 0), + SD_BUS_PROPERTY("IOReadOperations", "t", property_get_io_counter, 0, 0), + SD_BUS_PROPERTY("IOWriteBytes", "t", property_get_io_counter, 0, 0), + SD_BUS_PROPERTY("IOWriteOperations", "t", property_get_io_counter, 0, 0), + + SD_BUS_METHOD_WITH_ARGS("GetProcesses", + SD_BUS_NO_ARGS, + SD_BUS_ARGS("a(sus)", processes), + bus_unit_method_get_processes, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_METHOD_WITH_ARGS("AttachProcesses", + SD_BUS_ARGS("s", subcgroup, "au", pids), + SD_BUS_NO_RESULT, + bus_unit_method_attach_processes, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + +static int send_new_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + + p = unit_dbus_path(u); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "UnitNew"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "so", u->id, p); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +static int send_changed_signal(sd_bus *bus, void *userdata) { + _cleanup_free_ char *p = NULL; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + + p = unit_dbus_path(u); + if (!p) + return -ENOMEM; + + /* Send a properties changed signal. First for the specific + * type, then for the generic unit. The clients may rely on + * this order to get atomic behavior if needed. */ + + r = sd_bus_emit_properties_changed_strv( + bus, p, + unit_dbus_interface_from_type(u->type), + NULL); + if (r < 0) + return r; + + return sd_bus_emit_properties_changed_strv( + bus, p, + "org.freedesktop.systemd1.Unit", + NULL); +} + +void bus_unit_send_change_signal(Unit *u) { + int r; + assert(u); + + if (u->in_dbus_queue) { + LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u); + u->in_dbus_queue = false; + + /* The unit might be good to be GC once its pending signals have been sent */ + unit_add_to_gc_queue(u); + } + + if (!u->id) + return; + + r = bus_foreach_bus(u->manager, u->bus_track, u->sent_dbus_new_signal ? send_changed_signal : send_new_signal, u); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to send unit change signal for %s: %m", u->id); + + u->sent_dbus_new_signal = true; +} + +void bus_unit_send_pending_change_signal(Unit *u, bool including_new) { + + /* Sends out any pending change signals, but only if they really are pending. This call is used when we are + * about to change state in order to force out a PropertiesChanged signal beforehand if there was one pending + * so that clients can follow the full state transition */ + + if (!u->in_dbus_queue) /* If not enqueued, don't bother */ + return; + + if (!u->sent_dbus_new_signal && !including_new) /* If the unit was never announced, don't bother, it's fine if + * the unit appears in the new state right-away (except if the + * caller explicitly asked us to send it anyway) */ + return; + + if (MANAGER_IS_RELOADING(u->manager)) /* Don't generate unnecessary PropertiesChanged signals for the same unit + * when we are reloading. */ + return; + + bus_unit_send_change_signal(u); +} + +int bus_unit_send_pending_freezer_message(Unit *u, bool cancelled) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(u); + + if (!u->pending_freezer_invocation) + return 0; + + if (cancelled) + r = sd_bus_message_new_method_error( + u->pending_freezer_invocation, + &reply, + &SD_BUS_ERROR_MAKE_CONST( + BUS_ERROR_FREEZE_CANCELLED, "Freeze operation aborted")); + else + r = sd_bus_message_new_method_return(u->pending_freezer_invocation, &reply); + if (r < 0) + return r; + + r = sd_bus_send(NULL, reply, NULL); + if (r < 0) + log_warning_errno(r, "Failed to send queued message, ignoring: %m"); + + u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation); + + return 0; +} + +static int send_removed_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + + p = unit_dbus_path(u); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "UnitRemoved"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "so", u->id, p); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +void bus_unit_send_removed_signal(Unit *u) { + int r; + assert(u); + + if (!u->sent_dbus_new_signal || u->in_dbus_queue) + bus_unit_send_change_signal(u); + + if (!u->id) + return; + + r = bus_foreach_bus(u->manager, u->bus_track, send_removed_signal, u); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to send unit remove signal for %s: %m", u->id); +} + +int bus_unit_queue_job_one( + sd_bus_message *message, + Unit *u, + JobType type, + JobMode mode, + BusUnitQueueFlags flags, + sd_bus_message *reply, + sd_bus_error *error) { + + _cleanup_set_free_ Set *affected = NULL; + _cleanup_free_ char *job_path = NULL, *unit_path = NULL; + Job *j, *a; + int r; + + if (FLAGS_SET(flags, BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE) && unit_can_reload(u)) { + if (type == JOB_RESTART) + type = JOB_RELOAD_OR_START; + else if (type == JOB_TRY_RESTART) + type = JOB_TRY_RELOAD; + } + + if (type == JOB_STOP && + IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_ERROR, UNIT_BAD_SETTING) && + unit_active_state(u) == UNIT_INACTIVE) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", u->id); + + if ((type == JOB_START && u->refuse_manual_start) || + (type == JOB_STOP && u->refuse_manual_stop) || + (IN_SET(type, JOB_RESTART, JOB_TRY_RESTART) && (u->refuse_manual_start || u->refuse_manual_stop)) || + (type == JOB_RELOAD_OR_START && job_type_collapse(type, u) == JOB_START && u->refuse_manual_start)) + return sd_bus_error_setf(error, + BUS_ERROR_ONLY_BY_DEPENDENCY, + "Operation refused, unit %s may be requested by dependency only (it is configured to refuse manual start/stop).", + u->id); + + /* dbus-broker issues StartUnit for activation requests, and Type=dbus services automatically + * gain dependency on dbus.socket. Therefore, if dbus has a pending stop job, the new start + * job that pulls in dbus again would cause job type conflict. Let's avoid that by rejecting + * job enqueuing early. + * + * Note that unlike signal_activation_request(), we can't use unit_inactive_or_pending() + * here. StartUnit is a more generic interface, and thus users are allowed to use e.g. systemctl + * to start Type=dbus services even when dbus is inactive. */ + if (type == JOB_START && u->type == UNIT_SERVICE && SERVICE(u)->type == SERVICE_DBUS) + FOREACH_STRING(dbus_unit, SPECIAL_DBUS_SOCKET, SPECIAL_DBUS_SERVICE) { + Unit *dbus; + + dbus = manager_get_unit(u->manager, dbus_unit); + if (dbus && unit_stop_pending(dbus)) + return sd_bus_error_setf(error, + BUS_ERROR_SHUTTING_DOWN, + "Operation for unit %s refused, D-Bus is shutting down.", + u->id); + } + + if (FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY)) { + affected = set_new(NULL); + if (!affected) + return -ENOMEM; + } + + r = manager_add_job(u->manager, type, u, mode, affected, error, &j); + if (r < 0) + return r; + + r = bus_job_track_sender(j, message); + if (r < 0) + return r; + + /* Before we send the method reply, force out the announcement JobNew for this job */ + bus_job_send_pending_change_signal(j, true); + + job_path = job_dbus_path(j); + if (!job_path) + return -ENOMEM; + + /* The classic response is just a job object path */ + if (!FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY)) + return sd_bus_message_append(reply, "o", job_path); + + /* In verbose mode respond with the anchor job plus everything that has been affected */ + + unit_path = unit_dbus_path(j->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append(reply, "uosos", + j->id, job_path, + j->unit->id, unit_path, + job_type_to_string(j->type)); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(uosos)"); + if (r < 0) + return r; + + SET_FOREACH(a, affected) { + if (a->id == j->id) + continue; + + /* Free paths from previous iteration */ + job_path = mfree(job_path); + unit_path = mfree(unit_path); + + job_path = job_dbus_path(a); + if (!job_path) + return -ENOMEM; + + unit_path = unit_dbus_path(a->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(uosos)", + a->id, job_path, + a->unit->id, unit_path, + job_type_to_string(a->type)); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +int bus_unit_queue_job( + sd_bus_message *message, + Unit *u, + JobType type, + JobMode mode, + BusUnitQueueFlags flags, + sd_bus_error *error) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(message); + assert(u); + assert(type >= 0 && type < _JOB_TYPE_MAX); + assert(mode >= 0 && mode < _JOB_MODE_MAX); + + r = mac_selinux_unit_access_check( + u, message, + job_type_to_access_method(type), + error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = bus_unit_queue_job_one(message, u, type, mode, flags, reply, error); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int bus_unit_set_live_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int r; + + assert(u); + assert(name); + assert(message); + + /* Handles setting properties both "live" (i.e. at any time during runtime), and during creation (for + * transient units that are being created). */ + + if (streq(name, "Description")) { + const char *d; + + r = sd_bus_message_read(message, "s", &d); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_set_description(u, d); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Description=%s", d); + } + + return 1; + } + + /* A setting that only applies to active units. We don't actually write this to /run, this state is + * managed internally. "+foo" sets flag foo, "-foo" unsets flag foo, just "foo" resets flags to + * foo. The last type cannot be mixed with "+" or "-". */ + + if (streq(name, "Markers")) { + unsigned settings = 0, mask = 0; + bool some_plus_minus = false, some_absolute = false; + + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + const char *word; + bool b; + + r = sd_bus_message_read(message, "s", &word); + if (r < 0) + return r; + if (r == 0) + break; + + if (IN_SET(word[0], '+', '-')) { + b = word[0] == '+'; + word++; + some_plus_minus = true; + } else { + b = true; + some_absolute = true; + } + + UnitMarker m = unit_marker_from_string(word); + if (m < 0) + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, + "Unknown marker \"%s\".", word); + + SET_FLAG(settings, 1u << m, b); + SET_FLAG(mask, 1u << m, true); + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (some_plus_minus && some_absolute) + return sd_bus_error_set(error, BUS_ERROR_BAD_UNIT_SETTING, "Bad marker syntax."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (some_absolute) + u->markers = settings; + else + u->markers = settings | (u->markers & ~mask); + } + + return 1; + } + + return 0; +} + +static int bus_set_transient_emergency_action( + Unit *u, + const char *name, + EmergencyAction *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *s; + EmergencyAction v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + r = parse_emergency_action(s, u->manager->runtime_scope, &v); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + r == -EOPNOTSUPP ? "%s setting invalid for manager type: %s" + : "Invalid %s setting: %s", + name, s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = v; + unit_write_settingf(u, flags, name, + "%s=%s", name, s); + } + + return 1; +} + +static int bus_set_transient_exit_status( + Unit *u, + const char *name, + int *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int32_t k; + int r; + + assert(p); + + r = sd_bus_message_read(message, "i", &k); + if (r < 0) + return r; + + if (k > 255) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Exit status must be in range 0…255 or negative."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = k < 0 ? -1 : k; + + if (k < 0) + unit_write_settingf(u, flags, name, "%s=", name); + else + unit_write_settingf(u, flags, name, "%s=%i", name, k); + } + + return 1; +} + +static BUS_DEFINE_SET_TRANSIENT_PARSE(collect_mode, CollectMode, collect_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(job_mode, JobMode, job_mode_from_string); + +static int bus_set_transient_conditions( + Unit *u, + const char *name, + Condition **list, + bool is_condition, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *type_name, *param; + int trigger, negate, r; + bool empty = true; + + assert(list); + + r = sd_bus_message_enter_container(message, 'a', "(sbbs)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(sbbs)", &type_name, &trigger, &negate, ¶m)) > 0) { + ConditionType t; + + t = is_condition ? condition_type_from_string(type_name) : assert_type_from_string(type_name); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid condition type: %s", type_name); + + if (isempty(param)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Condition parameter in %s is empty", type_name); + + if (condition_takes_path(t) && !path_is_absolute(param)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in condition %s is not absolute: %s", type_name, param); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + Condition *c; + + c = condition_new(t, param, trigger, negate); + if (!c) + return -ENOMEM; + + LIST_PREPEND(conditions, *list, c); + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s%s%s", type_name, + trigger ? "|" : "", negate ? "!" : "", param); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + *list = condition_free_list(*list); + unit_write_settingf(u, flags, name, "%sNull=", is_condition ? "Condition" : "Assert"); + } + + return 1; +} + +static int bus_unit_set_transient_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + UnitDependency d; + int r; + + assert(u); + assert(name); + assert(message); + + /* Handles settings when transient units are created. This settings cannot be altered anymore after + * the unit has been created. */ + + if (streq(name, "SourcePath")) + return bus_set_transient_path(u, name, &u->source_path, message, flags, error); + + if (streq(name, "StopWhenUnneeded")) + return bus_set_transient_bool(u, name, &u->stop_when_unneeded, message, flags, error); + + if (streq(name, "RefuseManualStart")) + return bus_set_transient_bool(u, name, &u->refuse_manual_start, message, flags, error); + + if (streq(name, "RefuseManualStop")) + return bus_set_transient_bool(u, name, &u->refuse_manual_stop, message, flags, error); + + if (streq(name, "AllowIsolate")) + return bus_set_transient_bool(u, name, &u->allow_isolate, message, flags, error); + + if (streq(name, "DefaultDependencies")) + return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error); + + if (streq(name, "SurviveFinalKillSignal")) + return bus_set_transient_bool(u, name, &u->survive_final_kill_signal, message, flags, error); + + if (streq(name, "OnSuccessJobMode")) + return bus_set_transient_job_mode(u, name, &u->on_success_job_mode, message, flags, error); + + if (streq(name, "OnFailureJobMode")) + return bus_set_transient_job_mode(u, name, &u->on_failure_job_mode, message, flags, error); + + if (streq(name, "IgnoreOnIsolate")) + return bus_set_transient_bool(u, name, &u->ignore_on_isolate, message, flags, error); + + if (streq(name, "JobTimeoutUSec")) { + r = bus_set_transient_usec_fix_0(u, name, &u->job_timeout, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags) && !u->job_running_timeout_set) + u->job_running_timeout = u->job_timeout; + } + + if (streq(name, "JobRunningTimeoutUSec")) { + r = bus_set_transient_usec_fix_0(u, name, &u->job_running_timeout, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) + u->job_running_timeout_set = true; + + return r; + } + + if (streq(name, "JobTimeoutAction")) + return bus_set_transient_emergency_action(u, name, &u->job_timeout_action, message, flags, error); + + if (streq(name, "JobTimeoutRebootArgument")) + return bus_set_transient_string(u, name, &u->job_timeout_reboot_arg, message, flags, error); + + if (streq(name, "StartLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &u->start_ratelimit.interval, message, flags, error); + + if (streq(name, "StartLimitBurst")) + return bus_set_transient_unsigned(u, name, &u->start_ratelimit.burst, message, flags, error); + + if (streq(name, "StartLimitAction")) + return bus_set_transient_emergency_action(u, name, &u->start_limit_action, message, flags, error); + + if (streq(name, "FailureAction")) + return bus_set_transient_emergency_action(u, name, &u->failure_action, message, flags, error); + + if (streq(name, "SuccessAction")) + return bus_set_transient_emergency_action(u, name, &u->success_action, message, flags, error); + + if (streq(name, "FailureActionExitStatus")) + return bus_set_transient_exit_status(u, name, &u->failure_action_exit_status, message, flags, error); + + if (streq(name, "SuccessActionExitStatus")) + return bus_set_transient_exit_status(u, name, &u->success_action_exit_status, message, flags, error); + + if (streq(name, "RebootArgument")) + return bus_set_transient_string(u, name, &u->reboot_arg, message, flags, error); + + if (streq(name, "CollectMode")) + return bus_set_transient_collect_mode(u, name, &u->collect_mode, message, flags, error); + + if (streq(name, "Conditions")) + return bus_set_transient_conditions(u, name, &u->conditions, true, message, flags, error); + + if (streq(name, "Asserts")) + return bus_set_transient_conditions(u, name, &u->asserts, false, message, flags, error); + + if (streq(name, "Documentation")) { + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) + if (!documentation_url_is_valid(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid URL in %s: %s", name, *p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + u->documentation = strv_free(u->documentation); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + strv_extend_strv(&u->documentation, l, false); + + STRV_FOREACH(p, l) + unit_write_settingf(u, flags, name, "%s=%s", name, *p); + } + } + + return 1; + + } else if (streq(name, "Slice")) { + Unit *slice; + const char *s; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "The slice property is only available for units with control groups."); + if (u->type == UNIT_SLICE) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Slice may not be set for slice units."); + if (unit_has_name(u, SPECIAL_INIT_SCOPE)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set slice for init.scope"); + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!unit_name_is_valid(s, UNIT_NAME_PLAIN)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name '%s'", s); + + /* Note that we do not dispatch the load queue here yet, as we don't want our own transient unit to be + * loaded while we are still setting it up. Or in other words, we use manager_load_unit_prepare() + * instead of manager_load_unit() on purpose, here. */ + r = manager_load_unit_prepare(u->manager, s, NULL, error, &slice); + if (r < 0) + return r; + + if (slice->type != UNIT_SLICE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit name '%s' is not a slice", s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_set_slice(u, slice); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_PRIVATE, name, "Slice=%s", s); + } + + return 1; + + } else if (streq(name, "RequiresMountsFor")) { + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + path_simplify(*p); + + if (!path_is_absolute(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not absolute: %s", name, *p); + + if (!path_is_valid(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s has invalid length: %s", name, *p); + + if (!path_is_normalized(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not normalized: %s", name, *p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_require_mounts_for(u, *p, UNIT_DEPENDENCY_FILE); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add required mount \"%s\": %m", *p); + + unit_write_settingf(u, flags, name, "%s=%s", name, *p); + } + } + + return 1; + } + + if (streq(name, "RequiresOverridable")) + d = UNIT_REQUIRES; /* redirect for obsolete unit dependency type */ + else if (streq(name, "RequisiteOverridable")) + d = UNIT_REQUISITE; /* same here */ + else + d = unit_dependency_from_string(name); + + if (d >= 0) { + const char *other; + + if (!IN_SET(d, + UNIT_REQUIRES, + UNIT_REQUISITE, + UNIT_WANTS, + UNIT_BINDS_TO, + UNIT_PART_OF, + UNIT_UPHOLDS, + UNIT_CONFLICTS, + UNIT_BEFORE, + UNIT_AFTER, + UNIT_ON_SUCCESS, + UNIT_ON_FAILURE, + UNIT_PROPAGATES_RELOAD_TO, + UNIT_RELOAD_PROPAGATED_FROM, + UNIT_PROPAGATES_STOP_TO, + UNIT_STOP_PROPAGATED_FROM, + UNIT_JOINS_NAMESPACE_OF)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Dependency type %s may not be created transiently.", unit_dependency_to_string(d)); + + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "s", &other)) > 0) { + if (!unit_name_is_valid(other, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name %s", other); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *label = NULL; + + r = unit_add_dependency_by_name(u, d, other, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + label = strjoin(name, "-", other); + if (!label) + return -ENOMEM; + + unit_write_settingf(u, flags, label, "%s=%s", unit_dependency_to_string(d), other); + } + + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return 1; + + } else if (streq(name, "AddRef")) { + + int b; + + /* Why is this called "AddRef" rather than just "Ref", or "Reference"? There's already a "Ref()" method + * on the Unit interface, and it's probably not a good idea to expose a property and a method on the + * same interface (well, strictly speaking AddRef isn't exposed as full property, we just read it for + * transient units, but still). And "References" and "ReferencedBy" is already used as unit reference + * dependency type, hence let's not confuse things with that. + * + * Note that we don't actually add the reference to the bus track. We do that only after the setup of + * the transient unit is complete, so that setting this property multiple times in the same transient + * unit creation call doesn't count as individual references. */ + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) + u->bus_track_add = b; + + return 1; + } + + return 0; +} + +int bus_unit_set_properties( + Unit *u, + sd_bus_message *message, + UnitWriteFlags flags, + bool commit, + sd_bus_error *error) { + + bool for_real = false; + unsigned n = 0; + int r; + + assert(u); + assert(message); + + /* We iterate through the array twice. First run just checks if all passed data is valid, second run + * actually applies it. This implements transaction-like behaviour without actually providing full + * transactions. */ + + r = sd_bus_message_enter_container(message, 'a', "(sv)"); + if (r < 0) + goto error; + + for (;;) { + const char *name; + UnitWriteFlags f; + + r = sd_bus_message_enter_container(message, 'r', "sv"); + if (r < 0) + goto error; + if (r == 0) { + if (for_real || UNIT_WRITE_FLAGS_NOOP(flags)) + break; + + /* Reached EOF. Let's try again, and this time for realz... */ + r = sd_bus_message_rewind(message, false); + if (r < 0) + goto error; + + for_real = true; + continue; + } + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + goto error; + + r = sd_bus_message_enter_container(message, 'v', NULL); + if (r < 0) + goto error; + + /* If not for real, then mask out the two target flags */ + f = for_real ? flags : (flags & ~(UNIT_RUNTIME|UNIT_PERSISTENT)); + + if (UNIT_VTABLE(u)->bus_set_property) + r = UNIT_VTABLE(u)->bus_set_property(u, name, message, f, error); + else + r = 0; + if (r == 0 && u->transient && u->load_state == UNIT_STUB) + r = bus_unit_set_transient_property(u, name, message, f, error); + if (r == 0) + r = bus_unit_set_live_property(u, name, message, f, error); + if (r < 0) + goto error; + + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_PROPERTY_READ_ONLY, + "Cannot set property %s, or unknown property.", name); + + r = sd_bus_message_exit_container(message); + if (r < 0) + goto error; + + r = sd_bus_message_exit_container(message); + if (r < 0) + goto error; + + n += for_real; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + goto error; + + if (commit && n > 0 && UNIT_VTABLE(u)->bus_commit_properties) + UNIT_VTABLE(u)->bus_commit_properties(u); + + return n; + + error: + /* Pretty much any of the calls above can fail if the message is not formed properly + * or if it has unexpected contents. Fill in a more informative error message here. */ + if (sd_bus_error_is_set(error)) + return r; + return sd_bus_error_set_errnof(error, r, + r == -ENXIO ? "Failed to set unit properties: Unexpected message contents" + : "Failed to set unit properties: %m"); +} + +int bus_unit_validate_load_state(Unit *u, sd_bus_error *error) { + assert(u); + + /* Generates a pretty error if a unit isn't properly loaded. */ + + switch (u->load_state) { + + case UNIT_LOADED: + return 0; + + case UNIT_NOT_FOUND: + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not found.", u->id); + + case UNIT_BAD_SETTING: + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, "Unit %s has a bad unit file setting.", u->id); + + case UNIT_ERROR: /* Only show .load_error in UNIT_ERROR state */ + return sd_bus_error_set_errnof(error, u->load_error, + "Unit %s failed to load properly, please adjust/correct and reload service manager: %m", u->id); + + case UNIT_MASKED: + return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit %s is masked.", u->id); + + case UNIT_STUB: + case UNIT_MERGED: + default: + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unexpected load state of unit %s", u->id); + } +} + +static int bus_unit_track_handler(sd_bus_track *t, void *userdata) { + Unit *u = ASSERT_PTR(userdata); + + assert(t); + + u->bus_track = sd_bus_track_unref(u->bus_track); /* make sure we aren't called again */ + + /* If the client that tracks us disappeared, then there's reason to believe that the cgroup is empty now too, + * let's see */ + unit_add_to_cgroup_empty_queue(u); + + /* Also add the unit to the GC queue, after all if the client left it might be time to GC this unit */ + unit_add_to_gc_queue(u); + + return 0; +} + +static int bus_unit_allocate_bus_track(Unit *u) { + int r; + + assert(u); + + if (u->bus_track) + return 0; + + r = sd_bus_track_new(u->manager->api_bus, &u->bus_track, bus_unit_track_handler, u); + if (r < 0) + return r; + + r = sd_bus_track_set_recursive(u->bus_track, true); + if (r < 0) { + u->bus_track = sd_bus_track_unref(u->bus_track); + return r; + } + + return 0; +} + +int bus_unit_track_add_name(Unit *u, const char *name) { + int r; + + assert(u); + + r = bus_unit_allocate_bus_track(u); + if (r < 0) + return r; + + return sd_bus_track_add_name(u->bus_track, name); +} + +int bus_unit_track_add_sender(Unit *u, sd_bus_message *m) { + int r; + + assert(u); + + r = bus_unit_allocate_bus_track(u); + if (r < 0) + return r; + + return sd_bus_track_add_sender(u->bus_track, m); +} + +int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m) { + assert(u); + + /* If we haven't allocated the bus track object yet, then there's definitely no reference taken yet, + * return an error */ + if (!u->bus_track) + return -EUNATCH; + + return sd_bus_track_remove_sender(u->bus_track, m); +} diff --git a/src/core/dbus-unit.h b/src/core/dbus-unit.h new file mode 100644 index 0000000..6b7828e --- /dev/null +++ b/src/core/dbus-unit.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_unit_vtable[]; +extern const sd_bus_vtable bus_unit_cgroup_vtable[]; + +void bus_unit_send_change_signal(Unit *u); +void bus_unit_send_pending_change_signal(Unit *u, bool including_new); +int bus_unit_send_pending_freezer_message(Unit *u, bool cancelled); +void bus_unit_send_removed_signal(Unit *u); + +int bus_unit_method_start_generic(sd_bus_message *message, Unit *u, JobType job_type, bool reload_if_possible, sd_bus_error *error); +int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags flags, bool commit, sd_bus_error *error); +int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error); + +typedef enum BusUnitQueueFlags { + BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE = 1 << 0, + BUS_UNIT_QUEUE_VERBOSE_REPLY = 1 << 1, +} BusUnitQueueFlags; + +int bus_unit_queue_job_one( + sd_bus_message *message, + Unit *u, + JobType type, + JobMode mode, + BusUnitQueueFlags flags, + sd_bus_message *reply, + sd_bus_error *error); +int bus_unit_queue_job( + sd_bus_message *message, + Unit *u, + JobType type, + JobMode mode, + BusUnitQueueFlags flags, + sd_bus_error *error); +int bus_unit_validate_load_state(Unit *u, sd_bus_error *error); + +int bus_unit_track_add_name(Unit *u, const char *name); +int bus_unit_track_add_sender(Unit *u, sd_bus_message *m); +int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m); diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c new file mode 100644 index 0000000..d680a64 --- /dev/null +++ b/src/core/dbus-util.c @@ -0,0 +1,286 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-polkit.h" +#include "bus-util.h" +#include "dbus-util.h" +#include "escape.h" +#include "parse-util.h" +#include "path-util.h" +#include "unit-printf.h" +#include "user-util.h" +#include "unit.h" + +int bus_property_get_triggered_unit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata, *trigger; + + assert(bus); + assert(reply); + assert(u); + + trigger = UNIT_TRIGGER(u); + + return sd_bus_message_append(reply, "s", trigger ? trigger->id : NULL); +} + +BUS_DEFINE_SET_TRANSIENT(mode_t, "u", uint32_t, mode_t, "%04o"); +BUS_DEFINE_SET_TRANSIENT(unsigned, "u", uint32_t, unsigned, "%" PRIu32); + +static bool valid_user_group_name_or_id_relaxed(const char *u) { + return valid_user_group_name(u, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX); +} + +BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(user_relaxed, valid_user_group_name_or_id_relaxed); +BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(path, path_is_absolute); + +int bus_set_transient_string( + Unit *u, + const char *name, + char **p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = free_and_strdup(p, empty_to_null(v)); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s", name, strempty(v)); + } + + return 1; +} + +int bus_set_transient_bool( + Unit *u, + const char *name, + bool *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int v, r; + + assert(p); + + r = sd_bus_message_read(message, "b", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = v; + unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(v)); + } + + return 1; +} + +int bus_set_transient_tristate( + Unit *u, + const char *name, + int *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int v, r; + + assert(p); + + r = sd_bus_message_read(message, "b", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = v; + unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(v)); + } + + return 1; +} + +int bus_set_transient_usec_internal( + Unit *u, + const char *name, + usec_t *p, + bool fix_0, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + uint64_t v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "t", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (fix_0) + *p = v != 0 ? v: USEC_INFINITY; + else + *p = v; + + char *n = strndupa_safe(name, strlen(name) - 4); + unit_write_settingf(u, flags, name, "%sSec=%s", n, FORMAT_TIMESPAN(v, USEC_PER_MSEC)); + } + + return 1; +} + +int bus_verify_manage_units_async_full( + Unit *u, + const char *verb, + int capability, + const char *polkit_message, + bool interactive, + sd_bus_message *call, + sd_bus_error *error) { + + const char *details[9] = { + "unit", u->id, + "verb", verb, + }; + + if (polkit_message) { + details[4] = "polkit.message"; + details[5] = polkit_message; + details[6] = "polkit.gettext_domain"; + details[7] = GETTEXT_PACKAGE; + } + + return bus_verify_polkit_async( + call, + capability, + "org.freedesktop.systemd1.manage-units", + details, + interactive, + UID_INVALID, + &u->manager->polkit_registry, + error); +} + +/* ret_format_str is an accumulator, so if it has any pre-existing content, new options will be appended to it */ +int bus_read_mount_options( + sd_bus_message *message, + sd_bus_error *error, + MountOptions **ret_options, + char **ret_format_str, + const char *separator) { + + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *format_str = NULL; + const char *mount_options, *partition; + int r; + + assert(message); + assert(ret_options); + assert(separator); + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &partition, &mount_options)) > 0) { + _cleanup_free_ char *escaped = NULL; + _cleanup_free_ MountOptions *o = NULL; + PartitionDesignator partition_designator; + + if (chars_intersect(mount_options, WHITESPACE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid mount options string, contains whitespace character(s): %s", mount_options); + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid partition name %s", partition); + + /* Need to store the options with the escapes, so that they can be parsed again */ + escaped = shell_escape(mount_options, ":"); + if (!escaped) + return -ENOMEM; + + if (!strextend_with_separator(&format_str, separator, partition, ":", escaped)) + return -ENOMEM; + + o = new(MountOptions, 1); + if (!o) + return -ENOMEM; + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = strdup(mount_options), + }; + if (!o->options) + return -ENOMEM; + LIST_APPEND(mount_options, options, TAKE_PTR(o)); + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (options) { + if (ret_format_str) { + char *final = strjoin(*ret_format_str, !isempty(*ret_format_str) ? separator : "", format_str); + if (!final) + return -ENOMEM; + free_and_replace(*ret_format_str, final); + } + LIST_JOIN(mount_options, *ret_options, options); + } + + return 0; +} + +int bus_property_get_activation_details( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ActivationDetails **details = ASSERT_PTR(userdata); + _cleanup_strv_free_ char **pairs = NULL; + int r; + + assert(reply); + + r = activation_details_append_pair(*details, &pairs); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + STRV_FOREACH_PAIR(key, value, pairs) { + r = sd_bus_message_append(reply, "(ss)", *key, *value); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h new file mode 100644 index 0000000..9464b25 --- /dev/null +++ b/src/core/dbus-util.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "dissect-image.h" +#include "unit.h" + +int bus_property_get_triggered_unit(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + +#define BUS_DEFINE_SET_TRANSIENT(function, bus_type, type, cast_type, fmt) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=" fmt, name, v); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_IS_VALID(function, bus_type, type, cast_type, fmt, check) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + if (!check(v)) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: " fmt, name, v); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=" fmt, name, v); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_TO_STRING(function, bus_type, type, cast_type, fmt, to_string) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *s; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + s = to_string(v); \ + if (!s) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: " fmt, name, v); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, s); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(function, bus_type, type, cast_type, fmt, to_string) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + _cleanup_free_ char *s = NULL; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + r = to_string(v, &s); \ + if (r == -EINVAL) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: " fmt, name, v); \ + if (r < 0) \ + return r; \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", \ + name, strempty(s)); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_PARSE(function, type, parse) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *s; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "s", &s); \ + if (r < 0) \ + return r; \ + \ + v = parse(s); \ + if (v < 0) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: %s", name, s); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, s); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(function, type, parse) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *s; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "s", &s); \ + if (r < 0) \ + return r; \ + \ + r = parse(s, &v); \ + if (r < 0) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: %s", name, s); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, strempty(s)); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(function, check) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + char **p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "s", &v); \ + if (r < 0) \ + return r; \ + \ + if (!isempty(v) && !check(v)) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: %s", name, v); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + r = free_and_strdup(p, empty_to_null(v)); \ + if (r < 0) \ + return r; \ + \ + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, \ + "%s=%s", name, strempty(v)); \ + } \ + \ + return 1; \ + } + +int bus_set_transient_mode_t(Unit *u, const char *name, mode_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_unsigned(Unit *u, const char *name, unsigned *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_tristate(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { + return bus_set_transient_usec_internal(u, name, p, false, message, flags, error); +} +static inline int bus_set_transient_usec_fix_0(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { + return bus_set_transient_usec_internal(u, name, p, true, message, flags, error); +} +int bus_verify_manage_units_async_full(Unit *u, const char *verb, int capability, const char *polkit_message, bool interactive, sd_bus_message *call, sd_bus_error *error); + +int bus_read_mount_options(sd_bus_message *message, sd_bus_error *error, MountOptions **ret_options, char **ret_format_str, const char *separator); + +int bus_property_get_activation_details(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); diff --git a/src/core/dbus.c b/src/core/dbus.c new file mode 100644 index 0000000..ba2cec4 --- /dev/null +++ b/src/core/dbus.c @@ -0,0 +1,1273 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-internal.h" +#include "bus-polkit.h" +#include "bus-util.h" +#include "dbus-automount.h" +#include "dbus-cgroup.h" +#include "dbus-device.h" +#include "dbus-execute.h" +#include "dbus-job.h" +#include "dbus-kill.h" +#include "dbus-manager.h" +#include "dbus-mount.h" +#include "dbus-path.h" +#include "dbus-scope.h" +#include "dbus-service.h" +#include "dbus-slice.h" +#include "dbus-socket.h" +#include "dbus-swap.h" +#include "dbus-target.h" +#include "dbus-timer.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "mkdir-label.h" +#include "process-util.h" +#include "selinux-access.h" +#include "serialize.h" +#include "service.h" +#include "special.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "umask-util.h" +#include "user-util.h" + +#define CONNECTIONS_MAX 4096 + +static void destroy_bus(Manager *m, sd_bus **bus); + +int bus_send_pending_reload_message(Manager *m) { + int r; + + assert(m); + + if (!m->pending_reload_message) + return 0; + + /* If we cannot get rid of this message we won't dispatch any D-Bus messages, so that we won't end up wanting + * to queue another message. */ + + r = sd_bus_send(NULL, m->pending_reload_message, NULL); + if (r < 0) + log_warning_errno(r, "Failed to send queued message, ignoring: %m"); + + m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message); + + return 0; +} + +int bus_forward_agent_released(Manager *m, const char *path) { + int r; + + assert(m); + assert(path); + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + if (!m->system_bus) + return 0; + + /* If we are running a system instance we forward the agent message on the system bus, so that the user + * instances get notified about this, too */ + + r = sd_bus_emit_signal(m->system_bus, + "/org/freedesktop/systemd1/agent", + "org.freedesktop.systemd1.Agent", + "Released", + "s", path); + if (r < 0) + return log_debug_errno(r, "Failed to propagate agent release message: %m"); + + return 1; +} + +static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *cgroup; + uid_t sender_uid; + int r; + + assert(message); + + /* only accept org.freedesktop.systemd1.Agent from UID=0 */ + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0 || sender_uid != 0) + return 0; + + /* parse 'cgroup-empty' notification */ + r = sd_bus_message_read(message, "s", &cgroup); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + manager_notify_cgroup_empty(m, cgroup); + return 0; +} + +static int signal_disconnected(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + sd_bus *bus; + + assert(message); + assert_se(bus = sd_bus_message_get_bus(message)); + + if (bus == m->api_bus) + bus_done_api(m); + if (bus == m->system_bus) + bus_done_system(m); + + if (set_remove(m->private_buses, bus)) { + log_debug("Got disconnect on private connection."); + destroy_bus(m, &bus); + } + + return 0; +} + +static int signal_activation_request(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *name; + Unit *u; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SOCKET) || + manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SERVICE)) { + r = sd_bus_error_set(&error, BUS_ERROR_SHUTTING_DOWN, "Refusing activation, D-Bus is shutting down."); + goto failed; + } + + r = manager_load_unit(m, name, NULL, &error, &u); + if (r < 0) + goto failed; + + if (u->refuse_manual_start) { + r = sd_bus_error_setf(&error, BUS_ERROR_ONLY_BY_DEPENDENCY, "Operation refused, %s may be requested by dependency only (it is configured to refuse manual start/stop).", u->id); + goto failed; + } + + r = manager_add_job(m, JOB_START, u, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) + goto failed; + + /* Successfully queued, that's it for us */ + return 0; + +failed: + if (!sd_bus_error_is_set(&error)) + sd_bus_error_set_errno(&error, r); + + log_debug("D-Bus activation failed for %s: %s", name, bus_error_message(&error, r)); + + r = sd_bus_message_new_signal(sd_bus_message_get_bus(message), &reply, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Activator", "ActivationFailure"); + if (r < 0) { + bus_log_create_error(r); + return 0; + } + + r = sd_bus_message_append(reply, "sss", name, error.name, error.message); + if (r < 0) { + bus_log_create_error(r); + return 0; + } + + r = sd_bus_send_to(NULL, reply, "org.freedesktop.DBus", NULL); + if (r < 0) + return log_error_errno(r, "Failed to respond with to bus activation request: %m"); + + return 0; +} + +#if HAVE_SELINUX +static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *verb, *path; + Unit *u = NULL; + Job *j; + int r; + + assert(message); + + /* Our own method calls are all protected individually with + * selinux checks, but the built-in interfaces need to be + * protected too. */ + + if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", "Set")) + verb = "reload"; + else if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Introspectable", NULL) || + sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", NULL) || + sd_bus_message_is_method_call(message, "org.freedesktop.DBus.ObjectManager", NULL) || + sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Peer", NULL)) + verb = "status"; + else + return 0; + + path = sd_bus_message_get_path(message); + + if (object_path_startswith("/org/freedesktop/systemd1", path)) { + r = mac_selinux_access_check(message, verb, error); + if (r < 0) + return r; + + return 0; + } + + if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return 0; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return 0; + + u = manager_get_unit_by_pid(m, pid); + } else { + r = manager_get_job_from_dbus_path(m, path, &j); + if (r >= 0) + u = j->unit; + else + manager_load_unit_from_dbus_path(m, path, NULL, &u); + } + if (!u) + return 0; + + r = mac_selinux_unit_access_check(u, message, verb, error); + if (r < 0) + return r; + + return 0; +} +#endif + +static int find_unit(Manager *m, sd_bus *bus, const char *path, Unit **unit, sd_bus_error *error) { + Unit *u = NULL; /* just to appease gcc, initialization is not really necessary */ + int r; + + assert(m); + assert(bus); + assert(path); + + if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + sd_bus_message *message; + pid_t pid; + + message = sd_bus_get_current_message(bus); + if (!message) + return 0; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return 0; + } else { + r = manager_load_unit_from_dbus_path(m, path, error, &u); + if (r < 0) + return 0; + assert(u); + } + + *unit = u; + return 1; +} + +static int bus_unit_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(path); + assert(interface); + assert(found); + + return find_unit(m, bus, path, (Unit**) found, error); +} + +static int bus_unit_interface_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + *found = u; + return 1; +} + +static int bus_unit_cgroup_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + *found = u; + return 1; +} + +static int bus_cgroup_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + CGroupContext *c; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + *found = c; + return 1; +} + +static int bus_exec_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + ExecContext *c; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + c = unit_get_exec_context(u); + if (!c) + return 0; + + *found = c; + return 1; +} + +static int bus_kill_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + KillContext *c; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + c = unit_get_kill_context(u); + if (!c) + return 0; + + *found = c; + return 1; +} + +static int bus_unit_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + unsigned k = 0; + Unit *u; + + l = new0(char*, hashmap_size(m->units)+1); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(u, m->units) { + l[k] = unit_dbus_path(u); + if (!l[k]) + return -ENOMEM; + + k++; + } + + *nodes = TAKE_PTR(l); + + return k; +} + +static const BusObjectImplementation unit_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Unit", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_unit_vtable, bus_unit_find }), + .node_enumerator = bus_unit_enumerate, +}; + +static const BusObjectImplementation bus_automount_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Automount", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_automount_vtable, bus_unit_interface_find }), +}; + +static const BusObjectImplementation bus_device_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Device", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_device_vtable, bus_unit_interface_find }), +}; + +static const BusObjectImplementation bus_mount_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Mount", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_mount_vtable, bus_unit_interface_find }, + { bus_unit_cgroup_vtable, bus_unit_cgroup_find }, + { bus_cgroup_vtable, bus_cgroup_context_find }, + { bus_exec_vtable, bus_exec_context_find }, + { bus_kill_vtable, bus_kill_context_find }), +}; + +static const BusObjectImplementation bus_path_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Path", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_path_vtable, bus_unit_interface_find }), +}; + +static const BusObjectImplementation bus_scope_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Scope", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_scope_vtable, bus_unit_interface_find }, + { bus_unit_cgroup_vtable, bus_unit_cgroup_find }, + { bus_cgroup_vtable, bus_cgroup_context_find }, + { bus_kill_vtable, bus_kill_context_find }), +}; + +static const BusObjectImplementation bus_service_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Service", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_service_vtable, bus_unit_interface_find }, + { bus_unit_cgroup_vtable, bus_unit_cgroup_find }, + { bus_cgroup_vtable, bus_cgroup_context_find }, + { bus_exec_vtable, bus_exec_context_find }, + { bus_kill_vtable, bus_kill_context_find }), +}; + +static const BusObjectImplementation bus_slice_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Slice", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_slice_vtable, bus_unit_interface_find }, + { bus_unit_cgroup_vtable, bus_unit_cgroup_find }, + { bus_cgroup_vtable, bus_cgroup_context_find }), +}; + +static const BusObjectImplementation bus_socket_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Socket", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_socket_vtable, bus_unit_interface_find }, + { bus_unit_cgroup_vtable, bus_unit_cgroup_find }, + { bus_cgroup_vtable, bus_cgroup_context_find }, + { bus_exec_vtable, bus_exec_context_find }, + { bus_kill_vtable, bus_kill_context_find }), +}; + +static const BusObjectImplementation bus_swap_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Swap", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_swap_vtable, bus_unit_interface_find }, + { bus_unit_cgroup_vtable, bus_unit_cgroup_find }, + { bus_cgroup_vtable, bus_cgroup_context_find }, + { bus_exec_vtable, bus_exec_context_find }, + { bus_kill_vtable, bus_kill_context_find }), +}; + +static const BusObjectImplementation bus_target_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Target", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_target_vtable, bus_unit_interface_find }), +}; + +static const BusObjectImplementation bus_timer_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Timer", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_timer_vtable, bus_unit_interface_find }), +}; + +static const BusObjectImplementation bus_manager_object = { + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + .vtables = BUS_VTABLES(bus_manager_vtable), + .children = BUS_IMPLEMENTATIONS( + &job_object, + &unit_object, + &bus_automount_object, + &bus_device_object, + &bus_mount_object, + &bus_path_object, + &bus_scope_object, + &bus_service_object, + &bus_slice_object, + &bus_socket_object, + &bus_swap_object, + &bus_target_object, + &bus_timer_object), +}; + +static const BusObjectImplementation manager_log_control_object = { + "/org/freedesktop/LogControl1", + "org.freedesktop.LogControl1", + .vtables = BUS_VTABLES(bus_manager_log_control_vtable), +}; + +int bus_manager_introspect_implementations(FILE *out, const char *pattern) { + return bus_introspect_implementations( + out, + pattern, + BUS_IMPLEMENTATIONS(&bus_manager_object, + &manager_log_control_object)); +} + +static int bus_setup_api_vtables(Manager *m, sd_bus *bus) { + int r; + + assert(m); + assert(bus); + +#if HAVE_SELINUX + r = sd_bus_add_filter(bus, NULL, mac_selinux_filter, m); + if (r < 0) + return log_error_errno(r, "Failed to add SELinux access filter: %m"); +#endif + + r = bus_add_implementation(bus, &bus_manager_object, m); + if (r < 0) + return r; + + return bus_add_implementation(bus, &manager_log_control_object, m); +} + +static int bus_setup_disconnected_match(Manager *m, sd_bus *bus) { + int r; + + assert(m); + assert(bus); + + r = sd_bus_match_signal_async( + bus, + NULL, + "org.freedesktop.DBus.Local", + "/org/freedesktop/DBus/Local", + "org.freedesktop.DBus.Local", + "Disconnected", + signal_disconnected, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for Disconnected message: %m"); + + return 0; +} + +static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_ int nfd = -EBADF; + Manager *m = ASSERT_PTR(userdata); + sd_id128_t id; + int r; + + assert(s); + + nfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (nfd < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return 0; + + log_warning_errno(errno, "Failed to accept private connection, ignoring: %m"); + return 0; + } + + if (set_size(m->private_buses) >= CONNECTIONS_MAX) { + log_warning("Too many concurrent connections, refusing"); + return 0; + } + + r = sd_bus_new(&bus); + if (r < 0) { + log_warning_errno(r, "Failed to allocate new private connection bus: %m"); + return 0; + } + + (void) sd_bus_set_description(bus, "private-bus-connection"); + + r = sd_bus_set_fd(bus, nfd, nfd); + if (r < 0) { + log_warning_errno(r, "Failed to set fd on new connection bus: %m"); + return 0; + } + + TAKE_FD(nfd); + + r = bus_check_peercred(bus); + if (r < 0) { + log_warning_errno(r, "Incoming private connection from unprivileged client, refusing: %m"); + return 0; + } + + assert_se(sd_id128_randomize(&id) >= 0); + + r = sd_bus_set_server(bus, 1, id); + if (r < 0) { + log_warning_errno(r, "Failed to enable server support for new connection bus: %m"); + return 0; + } + + r = sd_bus_negotiate_creds(bus, 1, + SD_BUS_CREDS_PID|SD_BUS_CREDS_UID| + SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS| + SD_BUS_CREDS_SELINUX_CONTEXT| + SD_BUS_CREDS_COMM|SD_BUS_CREDS_DESCRIPTION); + if (r < 0) { + log_warning_errno(r, "Failed to enable credentials for new connection: %m"); + return 0; + } + + r = sd_bus_set_sender(bus, "org.freedesktop.systemd1"); + if (r < 0) { + log_warning_errno(r, "Failed to set direct connection sender: %m"); + return 0; + } + + r = sd_bus_start(bus); + if (r < 0) { + log_warning_errno(r, "Failed to start new connection bus: %m"); + return 0; + } + + if (DEBUG_LOGGING) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *c = NULL; + const char *comm = NULL, *description = NULL; + pid_t pid = 0; + + r = sd_bus_get_owner_creds(bus, SD_BUS_CREDS_PID|SD_BUS_CREDS_COMM|SD_BUS_CREDS_DESCRIPTION, &c); + if (r < 0) + log_warning_errno(r, "Failed to get peer creds, ignoring: %m"); + else { + (void) sd_bus_creds_get_pid(c, &pid); + (void) sd_bus_creds_get_comm(c, &comm); + (void) sd_bus_creds_get_description(c, &description); + } + + log_debug("Accepting direct incoming connection from " PID_FMT " (%s) [%s]", pid, strna(comm), strna(description)); + } + + r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) { + log_warning_errno(r, "Failed to attach new connection bus to event loop: %m"); + return 0; + } + + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return 0; + + r = bus_setup_api_vtables(m, bus); + if (r < 0) { + log_warning_errno(r, "Failed to set up API vtables on new connection bus: %m"); + return 0; + } + + r = bus_register_malloc_status(bus, "org.freedesktop.systemd1"); + if (r < 0) + log_warning_errno(r, "Failed to register MemoryAllocation1, ignoring: %m"); + + r = set_ensure_put(&m->private_buses, NULL, bus); + if (r == -ENOMEM) { + log_oom(); + return 0; + } + if (r < 0) { + log_warning_errno(r, "Failed to add new connection bus to set: %m"); + return 0; + } + + TAKE_PTR(bus); + + log_debug("Accepted new private connection."); + + return 0; +} + +static int bus_setup_api(Manager *m, sd_bus *bus) { + char *name; + Unit *u; + int r; + + assert(m); + assert(bus); + + /* Let's make sure we have enough credential bits so that we can make security and selinux decisions */ + r = sd_bus_negotiate_creds(bus, 1, + SD_BUS_CREDS_PID|SD_BUS_CREDS_UID| + SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS| + SD_BUS_CREDS_SELINUX_CONTEXT); + if (r < 0) + log_warning_errno(r, "Failed to enable credential passing, ignoring: %m"); + + r = bus_setup_api_vtables(m, bus); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(u, name, m->watch_bus) { + r = unit_install_bus_match(u, bus, name); + if (r < 0) + log_error_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name); + } + + r = sd_bus_match_signal_async( + bus, + NULL, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.systemd1.Activator", + "ActivationRequest", + signal_activation_request, NULL, m); + if (r < 0) + log_warning_errno(r, "Failed to subscribe to activation signal: %m"); + + /* Allow replacing of our name, to ease implementation of reexecution, where we keep the old connection open + * until after the new connection is set up and the name installed to allow clients to synchronously wait for + * reexecution to finish */ + r = sd_bus_request_name_async(bus, NULL, "org.freedesktop.systemd1", SD_BUS_NAME_REPLACE_EXISTING|SD_BUS_NAME_ALLOW_REPLACEMENT, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = bus_register_malloc_status(bus, "org.freedesktop.systemd1"); + if (r < 0) + log_warning_errno(r, "Failed to register MemoryAllocation1, ignoring: %m"); + + log_debug("Successfully connected to API bus."); + + return 0; +} + +int bus_init_api(Manager *m) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + int r; + + if (m->api_bus) + return 0; + + /* The API and system bus is the same if we are running in system mode */ + if (MANAGER_IS_SYSTEM(m) && m->system_bus) + bus = sd_bus_ref(m->system_bus); + else { + if (MANAGER_IS_SYSTEM(m)) + r = sd_bus_open_system_with_description(&bus, "bus-api-system"); + else + r = sd_bus_open_user_with_description(&bus, "bus-api-user"); + if (r < 0) + return log_error_errno(r, "Failed to connect to API bus: %m"); + + r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach API bus to event loop: %m"); + + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return r; + } + + r = bus_setup_api(m, bus); + if (r < 0) + return log_error_errno(r, "Failed to set up API bus: %m"); + + m->api_bus = TAKE_PTR(bus); + + return 0; +} + +static int bus_setup_system(Manager *m, sd_bus *bus) { + int r; + + assert(m); + assert(bus); + + /* if we are a user instance we get the Released message via the system bus */ + if (MANAGER_IS_USER(m)) { + r = sd_bus_match_signal_async( + bus, + NULL, + NULL, + "/org/freedesktop/systemd1/agent", + "org.freedesktop.systemd1.Agent", + "Released", + signal_agent_released, NULL, m); + if (r < 0) + log_warning_errno(r, "Failed to request Released match on system bus: %m"); + } + + log_debug("Successfully connected to system bus."); + return 0; +} + +int bus_init_system(Manager *m) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + int r; + + if (m->system_bus) + return 0; + + /* The API and system bus is the same if we are running in system mode */ + if (MANAGER_IS_SYSTEM(m) && m->api_bus) + bus = sd_bus_ref(m->api_bus); + else { + r = sd_bus_open_system_with_description(&bus, "bus-system"); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach system bus to event loop: %m"); + + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return r; + } + + r = bus_setup_system(m, bus); + if (r < 0) + return log_error_errno(r, "Failed to set up system bus: %m"); + + m->system_bus = TAKE_PTR(bus); + + return 0; +} + +int bus_init_private(Manager *m) { + _cleanup_close_ int fd = -EBADF; + union sockaddr_union sa; + socklen_t sa_len; + sd_event_source *s; + int r; + + assert(m); + + if (m->private_listen_fd >= 0) + return 0; + + if (MANAGER_IS_SYSTEM(m)) { + + /* We want the private bus only when running as init */ + if (getpid_cached() != 1) + return 0; + + r = sockaddr_un_set_path(&sa.un, "/run/systemd/private"); + } else { + _cleanup_free_ char *joined = NULL; + const char *e; + + e = secure_getenv("XDG_RUNTIME_DIR"); + if (!e) + return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN), + "XDG_RUNTIME_DIR is not set, refusing."); + + joined = path_join(e, "/systemd/private"); + if (!joined) + return log_oom(); + + r = sockaddr_un_set_path(&sa.un, joined); + } + if (r < 0) + return log_error_errno(r, "Can't set path for AF_UNIX socket to bind to: %m"); + sa_len = r; + + (void) mkdir_parents_label(sa.un.sun_path, 0755); + (void) sockaddr_un_unlink(&sa.un); + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate private socket: %m"); + + WITH_UMASK(0077) + r = bind(fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "Failed to bind private socket: %m"); + + r = listen(fd, SOMAXCONN_DELUXE); + if (r < 0) + return log_error_errno(errno, "Failed to make private socket listening: %m"); + + /* Generate an inotify event in case somebody waits for this socket to appear using inotify() */ + (void) touch(sa.un.sun_path); + + r = sd_event_add_io(m->event, &s, fd, EPOLLIN, bus_on_connection, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate event source: %m"); + + (void) sd_event_source_set_description(s, "bus-connection"); + + m->private_listen_fd = TAKE_FD(fd); + m->private_listen_event_source = s; + + log_debug("Successfully created private D-Bus server."); + + return 0; +} + +static void destroy_bus(Manager *m, sd_bus **bus) { + Unit *u; + Job *j; + + assert(m); + assert(bus); + + if (!*bus) + return; + + /* Make sure all bus slots watching names are released. */ + HASHMAP_FOREACH(u, m->watch_bus) { + if (u->match_bus_slot && sd_bus_slot_get_bus(u->match_bus_slot) == *bus) + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + if (u->get_name_owner_slot && sd_bus_slot_get_bus(u->get_name_owner_slot) == *bus) + u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot); + } + + /* Get rid of tracked clients on this bus */ + if (m->subscribed && sd_bus_track_get_bus(m->subscribed) == *bus) + m->subscribed = sd_bus_track_unref(m->subscribed); + + HASHMAP_FOREACH(j, m->jobs) + if (j->bus_track && sd_bus_track_get_bus(j->bus_track) == *bus) + j->bus_track = sd_bus_track_unref(j->bus_track); + + HASHMAP_FOREACH(u, m->units) { + if (u->bus_track && sd_bus_track_get_bus(u->bus_track) == *bus) + u->bus_track = sd_bus_track_unref(u->bus_track); + + /* Get rid of pending freezer messages on this bus */ + if (u->pending_freezer_invocation && sd_bus_message_get_bus(u->pending_freezer_invocation) == *bus) + u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation); + } + + /* Get rid of queued message on this bus */ + if (m->pending_reload_message && sd_bus_message_get_bus(m->pending_reload_message) == *bus) + m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message); + + /* Possibly flush unwritten data, but only if we are + * unprivileged, since we don't want to sync here */ + if (!MANAGER_IS_SYSTEM(m)) + sd_bus_flush(*bus); + + /* And destroy the object */ + *bus = sd_bus_close_unref(*bus); +} + +void bus_done_api(Manager *m) { + destroy_bus(m, &m->api_bus); +} + +void bus_done_system(Manager *m) { + destroy_bus(m, &m->system_bus); +} + +void bus_done_private(Manager *m) { + sd_bus *b; + + assert(m); + + while ((b = set_steal_first(m->private_buses))) + destroy_bus(m, &b); + + m->private_buses = set_free(m->private_buses); + + m->private_listen_event_source = sd_event_source_disable_unref(m->private_listen_event_source); + m->private_listen_fd = safe_close(m->private_listen_fd); +} + +void bus_done(Manager *m) { + assert(m); + + bus_done_api(m); + bus_done_system(m); + bus_done_private(m); + + assert(!m->subscribed); + + m->deserialized_subscribed = strv_free(m->deserialized_subscribed); + bus_verify_polkit_async_registry_free(m->polkit_registry); +} + +int bus_fdset_add_all(Manager *m, FDSet *fds) { + sd_bus *b; + int fd; + + assert(m); + assert(fds); + + /* When we are about to reexecute we add all D-Bus fds to the + * set to pass over to the newly executed systemd. They won't + * be used there however, except thatt they are closed at the + * very end of deserialization, those making it possible for + * clients to synchronously wait for systemd to reexec by + * simply waiting for disconnection */ + + if (m->api_bus) { + fd = sd_bus_get_fd(m->api_bus); + if (fd >= 0) { + fd = fdset_put_dup(fds, fd); + if (fd < 0) + return fd; + } + } + + SET_FOREACH(b, m->private_buses) { + fd = sd_bus_get_fd(b); + if (fd >= 0) { + fd = fdset_put_dup(fds, fd); + if (fd < 0) + return fd; + } + } + + /* We don't offer any APIs on the system bus (well, unless it + * is the same as the API bus) hence we don't bother with it + * here */ + + return 0; +} + +int bus_foreach_bus( + Manager *m, + sd_bus_track *subscribed2, + int (*send_message)(sd_bus *bus, void *userdata), + void *userdata) { + + sd_bus *b; + int r, ret = 0; + + /* Send to all direct buses, unconditionally */ + SET_FOREACH(b, m->private_buses) { + + /* Don't bother with enqueuing these messages to clients that haven't started yet */ + if (sd_bus_is_ready(b) <= 0) + continue; + + r = send_message(b, userdata); + if (r < 0) + ret = r; + } + + /* Send to API bus, but only if somebody is subscribed */ + if (m->api_bus && + (sd_bus_track_count(m->subscribed) > 0 || + sd_bus_track_count(subscribed2) > 0)) { + r = send_message(m->api_bus, userdata); + if (r < 0) + ret = r; + } + + return ret; +} + +void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix) { + const char *n; + + assert(f); + assert(prefix); + + for (n = sd_bus_track_first(t); n; n = sd_bus_track_next(t)) { + int c, j; + + c = sd_bus_track_count_name(t, n); + for (j = 0; j < c; j++) + (void) serialize_item(f, prefix, n); + } +} + +int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l) { + int r; + + assert(m); + assert(t); + + if (strv_isempty(l)) + return 0; + + if (!m->api_bus) + return 0; + + if (!*t) { + r = sd_bus_track_new(m->api_bus, t, NULL, NULL); + if (r < 0) + return r; + } + + r = sd_bus_track_set_recursive(*t, recursive); + if (r < 0) + return r; + + return bus_track_add_name_many(*t, l); +} + +int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-units", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-unit-files", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.reload-daemon", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.set-environment", NULL, false, UID_INVALID, &m->polkit_registry, error); +} +int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.bypass-dump-ratelimit", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +uint64_t manager_bus_n_queued_write(Manager *m) { + uint64_t c = 0; + sd_bus *b; + int r; + + /* Returns the total number of messages queued for writing on all our direct and API buses. */ + + SET_FOREACH(b, m->private_buses) { + uint64_t k; + + r = sd_bus_get_n_queued_write(b, &k); + if (r < 0) + log_debug_errno(r, "Failed to query queued messages for private bus: %m"); + else + c += k; + } + + if (m->api_bus) { + uint64_t k; + + r = sd_bus_get_n_queued_write(m->api_bus, &k); + if (r < 0) + log_debug_errno(r, "Failed to query queued messages for API bus: %m"); + else + c += k; + } + + return c; +} + +static void vtable_dump_bus_properties(FILE *f, const sd_bus_vtable *table) { + const sd_bus_vtable *i; + + for (i = table; i->type != _SD_BUS_VTABLE_END; i++) { + if (!IN_SET(i->type, _SD_BUS_VTABLE_PROPERTY, _SD_BUS_VTABLE_WRITABLE_PROPERTY) || + (i->flags & (SD_BUS_VTABLE_DEPRECATED | SD_BUS_VTABLE_HIDDEN)) != 0) + continue; + + fprintf(f, "%s\n", i->x.property.member); + } +} + +void dump_bus_properties(FILE *f) { + assert(f); + + vtable_dump_bus_properties(f, bus_automount_vtable); + vtable_dump_bus_properties(f, bus_cgroup_vtable); + vtable_dump_bus_properties(f, bus_device_vtable); + vtable_dump_bus_properties(f, bus_exec_vtable); + vtable_dump_bus_properties(f, bus_job_vtable); + vtable_dump_bus_properties(f, bus_kill_vtable); + vtable_dump_bus_properties(f, bus_manager_vtable); + vtable_dump_bus_properties(f, bus_mount_vtable); + vtable_dump_bus_properties(f, bus_path_vtable); + vtable_dump_bus_properties(f, bus_scope_vtable); + vtable_dump_bus_properties(f, bus_service_vtable); + vtable_dump_bus_properties(f, bus_slice_vtable); + vtable_dump_bus_properties(f, bus_socket_vtable); + vtable_dump_bus_properties(f, bus_swap_vtable); + vtable_dump_bus_properties(f, bus_target_vtable); + vtable_dump_bus_properties(f, bus_timer_vtable); + vtable_dump_bus_properties(f, bus_unit_vtable); + vtable_dump_bus_properties(f, bus_unit_cgroup_vtable); +} diff --git a/src/core/dbus.h b/src/core/dbus.h new file mode 100644 index 0000000..50e7bb4 --- /dev/null +++ b/src/core/dbus.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "manager.h" + +int bus_send_pending_reload_message(Manager *m); + +int bus_init_private(Manager *m); +int bus_init_api(Manager *m); +int bus_init_system(Manager *m); + +void bus_done_private(Manager *m); +void bus_done_api(Manager *m); +void bus_done_system(Manager *m); +void bus_done(Manager *m); + +int bus_fdset_add_all(Manager *m, FDSet *fds); + +void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix); +int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l); + +int bus_foreach_bus(Manager *m, sd_bus_track *subscribed2, int (*send_message)(sd_bus *bus, void *userdata), void *userdata); + +int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error); + +int bus_forward_agent_released(Manager *m, const char *path); + +uint64_t manager_bus_n_queued_write(Manager *m); + +void dump_bus_properties(FILE *f); +int bus_manager_introspect_implementations(FILE *out, const char *pattern); diff --git a/src/core/device.c b/src/core/device.c new file mode 100644 index 0000000..6b2d7c3 --- /dev/null +++ b/src/core/device.c @@ -0,0 +1,1301 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "dbus-device.h" +#include "dbus-unit.h" +#include "device-private.h" +#include "device-util.h" +#include "device.h" +#include "log.h" +#include "parse-util.h" +#include "path-util.h" +#include "ratelimit.h" +#include "serialize.h" +#include "stat-util.h" +#include "string-util.h" +#include "swap.h" +#include "udev-util.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_DEVICE_STATE_MAX] = { + [DEVICE_DEAD] = UNIT_INACTIVE, + [DEVICE_TENTATIVE] = UNIT_ACTIVATING, + [DEVICE_PLUGGED] = UNIT_ACTIVE, +}; + +static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata); + +static int device_by_path(Manager *m, const char *path, Unit **ret) { + _cleanup_free_ char *e = NULL; + Unit *u; + int r; + + assert(m); + assert(path); + + r = unit_name_from_path(path, ".device", &e); + if (r < 0) + return r; + + u = manager_get_unit(m, e); + if (!u) + return -ENOENT; + + if (ret) + *ret = u; + return 0; +} + +static void device_unset_sysfs(Device *d) { + Hashmap *devices; + + assert(d); + + if (!d->sysfs) + return; + + /* Remove this unit from the chain of devices which share the same sysfs path. */ + + devices = UNIT(d)->manager->devices_by_sysfs; + + if (d->same_sysfs_prev) + /* If this is not the first unit, then simply remove this unit. */ + d->same_sysfs_prev->same_sysfs_next = d->same_sysfs_next; + else if (d->same_sysfs_next) + /* If this is the first unit, replace with the next unit. */ + assert_se(hashmap_replace(devices, d->same_sysfs_next->sysfs, d->same_sysfs_next) >= 0); + else + /* Otherwise, remove the entry. */ + hashmap_remove(devices, d->sysfs); + + if (d->same_sysfs_next) + d->same_sysfs_next->same_sysfs_prev = d->same_sysfs_prev; + + d->same_sysfs_prev = d->same_sysfs_next = NULL; + + d->sysfs = mfree(d->sysfs); +} + +static int device_set_sysfs(Device *d, const char *sysfs) { + _cleanup_free_ char *copy = NULL; + Device *first; + int r; + + assert(d); + + if (streq_ptr(d->sysfs, sysfs)) + return 0; + + r = hashmap_ensure_allocated(&UNIT(d)->manager->devices_by_sysfs, &path_hash_ops); + if (r < 0) + return r; + + copy = strdup(sysfs); + if (!copy) + return -ENOMEM; + + device_unset_sysfs(d); + + first = hashmap_get(UNIT(d)->manager->devices_by_sysfs, sysfs); + LIST_PREPEND(same_sysfs, first, d); + + r = hashmap_replace(UNIT(d)->manager->devices_by_sysfs, copy, first); + if (r < 0) { + LIST_REMOVE(same_sysfs, first, d); + return r; + } + + d->sysfs = TAKE_PTR(copy); + unit_add_to_dbus_queue(UNIT(d)); + + return 0; +} + +static void device_init(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + assert(UNIT(d)->load_state == UNIT_STUB); + + /* In contrast to all other unit types we timeout jobs waiting + * for devices by default. This is because they otherwise wait + * indefinitely for plugged in devices, something which cannot + * happen for the other units since their operations time out + * anyway. */ + u->job_running_timeout = u->manager->defaults.device_timeout_usec; + + u->ignore_on_isolate = true; + + d->deserialized_state = _DEVICE_STATE_INVALID; +} + +static void device_done(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + + device_unset_sysfs(d); + d->deserialized_sysfs = mfree(d->deserialized_sysfs); + d->wants_property = strv_free(d->wants_property); + d->path = mfree(d->path); +} + +static int device_load(Unit *u) { + int r; + + r = unit_load_fragment_and_dropin(u, false); + if (r < 0) + return r; + + if (!u->description) { + /* Generate a description based on the path, to be used until the device is initialized + properly */ + r = unit_name_to_path(u->id, &u->description); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to unescape name: %m"); + } + + return 0; +} + +static void device_set_state(Device *d, DeviceState state) { + DeviceState old_state; + + assert(d); + + if (d->state != state) + bus_unit_send_pending_change_signal(UNIT(d), false); + + old_state = d->state; + d->state = state; + + if (state == DEVICE_DEAD) + device_unset_sysfs(d); + + if (state != old_state) + log_unit_debug(UNIT(d), "Changed %s -> %s", device_state_to_string(old_state), device_state_to_string(state)); + + unit_notify(UNIT(d), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); +} + +static void device_found_changed(Device *d, DeviceFound previous, DeviceFound now) { + assert(d); + + /* Didn't exist before, but does now? if so, generate a new invocation ID for it */ + if (previous == DEVICE_NOT_FOUND && now != DEVICE_NOT_FOUND) + (void) unit_acquire_invocation_id(UNIT(d)); + + if (FLAGS_SET(now, DEVICE_FOUND_UDEV)) + /* When the device is known to udev we consider it plugged. */ + device_set_state(d, DEVICE_PLUGGED); + else if (now != DEVICE_NOT_FOUND && !FLAGS_SET(previous, DEVICE_FOUND_UDEV)) + /* If the device has not been seen by udev yet, but is now referenced by the kernel, then we assume the + * kernel knows it now, and udev might soon too. */ + device_set_state(d, DEVICE_TENTATIVE); + else + /* If nobody sees the device, or if the device was previously seen by udev and now is only referenced + * from the kernel, then we consider the device is gone, the kernel just hasn't noticed it yet. */ + device_set_state(d, DEVICE_DEAD); +} + +static void device_update_found_one(Device *d, DeviceFound found, DeviceFound mask) { + assert(d); + + if (MANAGER_IS_RUNNING(UNIT(d)->manager)) { + DeviceFound n, previous; + + /* When we are already running, then apply the new mask right-away, and trigger state changes + * right-away */ + + n = (d->found & ~mask) | (found & mask); + if (n == d->found) + return; + + previous = d->found; + d->found = n; + + device_found_changed(d, previous, n); + } else + /* We aren't running yet, let's apply the new mask to the shadow variable instead, which we'll apply as + * soon as we catch-up with the state. */ + d->enumerated_found = (d->enumerated_found & ~mask) | (found & mask); +} + +static void device_update_found_by_sysfs(Manager *m, const char *sysfs, DeviceFound found, DeviceFound mask) { + Device *l; + + assert(m); + assert(sysfs); + + if (mask == 0) + return; + + l = hashmap_get(m->devices_by_sysfs, sysfs); + LIST_FOREACH(same_sysfs, d, l) + device_update_found_one(d, found, mask); +} + +static void device_update_found_by_name(Manager *m, const char *path, DeviceFound found, DeviceFound mask) { + Unit *u; + + assert(m); + assert(path); + + if (mask == 0) + return; + + if (device_by_path(m, path, &u) < 0) + return; + + device_update_found_one(DEVICE(u), found, mask); +} + +static int device_coldplug(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + assert(d->state == DEVICE_DEAD); + + /* First, let's put the deserialized state and found mask into effect, if we have it. */ + if (d->deserialized_state < 0) + return 0; + + Manager *m = u->manager; + DeviceFound found = d->deserialized_found; + DeviceState state = d->deserialized_state; + + /* On initial boot, switch-root, reload, reexecute, the following happen: + * 1. MANAGER_IS_RUNNING() == false + * 2. enumerate devices: manager_enumerate() -> device_enumerate() + * Device.enumerated_found is set. + * 3. deserialize devices: manager_deserialize() -> device_deserialize_item() + * Device.deserialize_state and Device.deserialized_found are set. + * 4. coldplug devices: manager_coldplug() -> device_coldplug() + * deserialized properties are copied to the main properties. + * 5. MANAGER_IS_RUNNING() == true: manager_ready() + * 6. catchup devices: manager_catchup() -> device_catchup() + * Device.enumerated_found is applied to Device.found, and state is updated based on that. + * + * Notes: + * - On initial boot, no udev database exists. Hence, no devices are enumerated in the step 2. + * Also, there is no deserialized device. Device units are (a) generated based on dependencies of + * other units, or (b) generated when uevents are received. + * + * - On switch-root, the udev database may be cleared, except for devices with sticky bit, i.e. + * OPTIONS="db_persist". Hence, almost no devices are enumerated in the step 2. However, in + * general, we have several serialized devices. So, DEVICE_FOUND_UDEV bit in the + * Device.deserialized_found must be ignored, as udev rules in initrd and the main system are often + * different. If the deserialized state is DEVICE_PLUGGED, we need to downgrade it to + * DEVICE_TENTATIVE. Unlike the other starting mode, MANAGER_IS_SWITCHING_ROOT() is true when + * device_coldplug() and device_catchup() are called. Hence, let's conditionalize the operations by + * using the flag. After switch-root, systemd-udevd will (re-)process all devices, and the + * Device.found and Device.state will be adjusted. + * + * - On reload or reexecute, we can trust Device.enumerated_found, Device.deserialized_found, and + * Device.deserialized_state. Of course, deserialized parameters may be outdated, but the unit + * state can be adjusted later by device_catchup() or uevents. */ + + if (MANAGER_IS_SWITCHING_ROOT(m) && + !FLAGS_SET(d->enumerated_found, DEVICE_FOUND_UDEV)) { + + /* The device has not been enumerated. On switching-root, such situation is natural. See the + * above comment. To prevent problematic state transition active → dead → active, let's + * drop the DEVICE_FOUND_UDEV flag and downgrade state to DEVICE_TENTATIVE(activating). See + * issue #12953 and #23208. */ + found &= ~DEVICE_FOUND_UDEV; + if (state == DEVICE_PLUGGED) + state = DEVICE_TENTATIVE; + + /* Also check the validity of the device syspath. Without this check, if the device was + * removed while switching root, it would never go to inactive state, as both Device.found + * and Device.enumerated_found do not have the DEVICE_FOUND_UDEV flag, so device_catchup() in + * device_update_found_one() does nothing in most cases. See issue #25106. Note that the + * syspath field is only serialized when systemd is sufficiently new and the device has been + * already processed by udevd. */ + if (d->deserialized_sysfs) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + if (sd_device_new_from_syspath(&dev, d->deserialized_sysfs) < 0) + state = DEVICE_DEAD; + } + } + + if (d->found == found && d->state == state) + return 0; + + d->found = found; + device_set_state(d, state); + return 0; +} + +static void device_catchup(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + + /* Second, let's update the state with the enumerated state */ + device_update_found_one(d, d->enumerated_found, DEVICE_FOUND_MASK); +} + +static const struct { + DeviceFound flag; + const char *name; +} device_found_map[] = { + { DEVICE_FOUND_UDEV, "found-udev" }, + { DEVICE_FOUND_MOUNT, "found-mount" }, + { DEVICE_FOUND_SWAP, "found-swap" }, +}; + +static int device_found_to_string_many(DeviceFound flags, char **ret) { + _cleanup_free_ char *s = NULL; + + assert(ret); + + for (size_t i = 0; i < ELEMENTSOF(device_found_map); i++) { + if (!FLAGS_SET(flags, device_found_map[i].flag)) + continue; + + if (!strextend_with_separator(&s, ",", device_found_map[i].name)) + return -ENOMEM; + } + + *ret = TAKE_PTR(s); + + return 0; +} + +static int device_found_from_string_many(const char *name, DeviceFound *ret) { + DeviceFound flags = 0; + int r; + + assert(ret); + + for (;;) { + _cleanup_free_ char *word = NULL; + DeviceFound f = 0; + unsigned i; + + r = extract_first_word(&name, &word, ",", 0); + if (r < 0) + return r; + if (r == 0) + break; + + for (i = 0; i < ELEMENTSOF(device_found_map); i++) + if (streq(word, device_found_map[i].name)) { + f = device_found_map[i].flag; + break; + } + + if (f == 0) + return -EINVAL; + + flags |= f; + } + + *ret = flags; + return 0; +} + +static int device_serialize(Unit *u, FILE *f, FDSet *fds) { + _cleanup_free_ char *s = NULL; + Device *d = DEVICE(u); + + assert(d); + assert(u); + assert(f); + assert(fds); + + if (d->sysfs) + (void) serialize_item(f, "sysfs", d->sysfs); + + if (d->path) + (void) serialize_item(f, "path", d->path); + + (void) serialize_item(f, "state", device_state_to_string(d->state)); + + if (device_found_to_string_many(d->found, &s) >= 0) + (void) serialize_item(f, "found", s); + + return 0; +} + +static int device_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Device *d = DEVICE(u); + int r; + + assert(d); + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "sysfs")) { + if (!d->deserialized_sysfs) { + d->deserialized_sysfs = strdup(value); + if (!d->deserialized_sysfs) + log_oom_debug(); + } + + } else if (streq(key, "path")) { + if (!d->path) { + d->path = strdup(value); + if (!d->path) + log_oom_debug(); + } + + } else if (streq(key, "state")) { + DeviceState state; + + state = device_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value, ignoring: %s", value); + else + d->deserialized_state = state; + + } else if (streq(key, "found")) { + r = device_found_from_string_many(value, &d->deserialized_found); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse found value '%s', ignoring: %m", value); + + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static void device_dump(Unit *u, FILE *f, const char *prefix) { + Device *d = DEVICE(u); + _cleanup_free_ char *s = NULL; + + assert(d); + + (void) device_found_to_string_many(d->found, &s); + + fprintf(f, + "%sDevice State: %s\n" + "%sDevice Path: %s\n" + "%sSysfs Path: %s\n" + "%sFound: %s\n", + prefix, device_state_to_string(d->state), + prefix, strna(d->path), + prefix, strna(d->sysfs), + prefix, strna(s)); + + STRV_FOREACH(i, d->wants_property) + fprintf(f, "%sudev SYSTEMD_WANTS: %s\n", + prefix, *i); +} + +static UnitActiveState device_active_state(Unit *u) { + assert(u); + + return state_translation_table[DEVICE(u)->state]; +} + +static const char *device_sub_state_to_string(Unit *u) { + assert(u); + + return device_state_to_string(DEVICE(u)->state); +} + +static int device_update_description(Unit *u, sd_device *dev, const char *path) { + _cleanup_free_ char *j = NULL; + const char *model, *label, *desc; + int r; + + assert(u); + assert(path); + + desc = path; + + if (dev && device_get_model_string(dev, &model) >= 0) { + desc = model; + + /* Try to concatenate the device model string with a label, if there is one */ + if (sd_device_get_property_value(dev, "ID_FS_LABEL", &label) >= 0 || + sd_device_get_property_value(dev, "ID_PART_ENTRY_NAME", &label) >= 0 || + sd_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER", &label) >= 0) { + + desc = j = strjoin(model, " ", label); + if (!j) + return log_oom(); + } + } + + r = unit_set_description(u, desc); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set device description: %m"); + + return 0; +} + +static int device_add_udev_wants(Unit *u, sd_device *dev) { + _cleanup_strv_free_ char **added = NULL; + const char *wants, *property; + Device *d = DEVICE(u); + int r; + + assert(d); + assert(dev); + + property = MANAGER_IS_USER(u->manager) ? "SYSTEMD_USER_WANTS" : "SYSTEMD_WANTS"; + + r = sd_device_get_property_value(dev, property, &wants); + if (r < 0) + return 0; + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&wants, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to parse property %s with value %s: %m", property, wants); + + if (unit_name_is_valid(word, UNIT_NAME_TEMPLATE) && d->sysfs) { + _cleanup_free_ char *escaped = NULL; + + /* If the unit name is specified as template, then automatically fill in the sysfs path of the + * device as instance name, properly escaped. */ + + r = unit_name_path_escape(d->sysfs, &escaped); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to escape %s: %m", d->sysfs); + + r = unit_name_replace_instance(word, escaped, &k); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to build %s instance of template %s: %m", escaped, word); + } else { + /* If this is not a template, then let's mangle it so, that it becomes a valid unit name. */ + + r = unit_name_mangle(word, UNIT_NAME_MANGLE_WARN, &k); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to mangle unit name \"%s\": %m", word); + } + + r = unit_add_dependency_by_name(u, UNIT_WANTS, k, true, UNIT_DEPENDENCY_UDEV); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add Wants= dependency: %m"); + + r = strv_consume(&added, TAKE_PTR(k)); + if (r < 0) + return log_oom(); + } + + if (d->state != DEVICE_DEAD) + /* So here's a special hack, to compensate for the fact that the udev database's reload cycles are not + * synchronized with our own reload cycles: when we detect that the SYSTEMD_WANTS property of a device + * changes while the device unit is already up, let's skip to trigger units that were already listed + * and are active, and start units otherwise. This typically happens during the boot-time switch root + * transition, as udev devices will generally already be up in the initrd, but SYSTEMD_WANTS properties + * get then added through udev rules only available on the host system, and thus only when the initial + * udev coldplug trigger runs. + * + * We do this only if the device has been up already when we parse this, as otherwise the usual + * dependency logic that is run from the dead → plugged transition will trigger these deps. */ + STRV_FOREACH(i, added) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + if (strv_contains(d->wants_property, *i)) { + Unit *v; + + v = manager_get_unit(u->manager, *i); + if (v && UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(v))) + continue; /* The unit was already listed and is running. */ + } + + r = manager_add_job_by_name(u->manager, JOB_START, *i, JOB_FAIL, NULL, &error, NULL); + if (r < 0) + log_unit_full_errno(u, sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_UNIT) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to enqueue %s job, ignoring: %s", property, bus_error_message(&error, r)); + } + + return strv_free_and_replace(d->wants_property, added); +} + +static bool device_is_bound_by_mounts(Device *d, sd_device *dev) { + int r; + + assert(d); + assert(dev); + + r = device_get_property_bool(dev, "SYSTEMD_MOUNT_DEVICE_BOUND"); + if (r < 0 && r != -ENOENT) + log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_MOUNT_DEVICE_BOUND= udev property, ignoring: %m"); + + d->bind_mounts = r > 0; + + return d->bind_mounts; +} + +static void device_upgrade_mount_deps(Unit *u) { + Unit *other; + void *v; + int r; + + /* Let's upgrade Requires= to BindsTo= on us. (Used when SYSTEMD_MOUNT_DEVICE_BOUND is set) */ + + HASHMAP_FOREACH_KEY(v, other, unit_get_dependencies(u, UNIT_REQUIRED_BY)) { + if (other->type != UNIT_MOUNT) + continue; + + r = unit_add_dependency(other, UNIT_BINDS_TO, u, true, UNIT_DEPENDENCY_UDEV); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to add BindsTo= dependency between device and mount unit, ignoring: %m"); + } +} + +static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool main, Set **units) { + _cleanup_(unit_freep) Unit *new_unit = NULL; + _cleanup_free_ char *e = NULL; + const char *sysfs = NULL; + Unit *u; + int r; + + assert(m); + assert(path); + + if (dev) { + r = sd_device_get_syspath(dev, &sysfs); + if (r < 0) + return log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m"); + } + + r = unit_name_from_path(path, ".device", &e); + if (r < 0) + return log_struct_errno( + LOG_WARNING, r, + "MESSAGE_ID=" SD_MESSAGE_DEVICE_PATH_NOT_SUITABLE_STR, + "DEVICE=%s", path, + LOG_MESSAGE("Failed to generate valid unit name from device path '%s', ignoring device: %m", + path)); + + u = manager_get_unit(m, e); + if (u) { + /* The device unit can still be present even if the device was unplugged: a mount unit can reference it + * hence preventing the GC to have garbaged it. That's desired since the device unit may have a + * dependency on the mount unit which was added during the loading of the later. When the device is + * plugged the sysfs might not be initialized yet, as we serialize the device's state but do not + * serialize the sysfs path across reloads/reexecs. Hence, when coming back from a reload/restart we + * might have the state valid, but not the sysfs path. Also, there is another possibility; when multiple + * devices have the same devlink (e.g. /dev/disk/by-uuid/xxxx), adding/updating/removing one of the + * device causes syspath change. Hence, let's always update sysfs path. */ + + /* Let's remove all dependencies generated due to udev properties. We'll re-add whatever is configured + * now below. */ + unit_remove_dependencies(u, UNIT_DEPENDENCY_UDEV); + + } else { + r = unit_new_for_name(m, sizeof(Device), e, &new_unit); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to allocate device unit %s: %m", e); + + u = new_unit; + + unit_add_to_load_queue(u); + } + + if (!DEVICE(u)->path) { + DEVICE(u)->path = strdup(path); + if (!DEVICE(u)->path) + return log_oom(); + } + + /* If this was created via some dependency and has not actually been seen yet ->sysfs will not be + * initialized. Hence initialize it if necessary. */ + if (sysfs) { + r = device_set_sysfs(DEVICE(u), sysfs); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set sysfs path %s: %m", sysfs); + + /* The additional systemd udev properties we only interpret for the main object */ + if (main) + (void) device_add_udev_wants(u, dev); + } + + (void) device_update_description(u, dev, path); + + /* So the user wants the mount units to be bound to the device but a mount unit might has been seen + * by systemd before the device appears on its radar. In this case the device unit is partially + * initialized and includes the deps on the mount unit but at that time the "bind mounts" flag wasn't + * present. Fix this up now. */ + if (dev && device_is_bound_by_mounts(DEVICE(u), dev)) + device_upgrade_mount_deps(u); + + if (units) { + r = set_ensure_put(units, NULL, DEVICE(u)); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to store unit: %m"); + } + + TAKE_PTR(new_unit); + return 0; +} + +static bool device_is_ready(sd_device *dev) { + int r; + + assert(dev); + + if (device_for_action(dev, SD_DEVICE_REMOVE)) + return false; + + r = device_is_renaming(dev); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to check if device is renaming, assuming device is not renaming: %m"); + if (r > 0) { + log_device_debug(dev, "Device busy: device is renaming"); + return false; + } + + /* Is it really tagged as 'systemd' right now? */ + r = sd_device_has_current_tag(dev, "systemd"); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to check if device has \"systemd\" tag, assuming device is not tagged with \"systemd\": %m"); + if (r == 0) + log_device_debug(dev, "Device busy: device is not tagged with \"systemd\""); + if (r <= 0) + return false; + + r = device_get_property_bool(dev, "SYSTEMD_READY"); + if (r < 0 && r != -ENOENT) + log_device_warning_errno(dev, r, "Failed to get device SYSTEMD_READY property, assuming device does not have \"SYSTEMD_READY\" property: %m"); + if (r == 0) + log_device_debug(dev, "Device busy: SYSTEMD_READY property from device is false"); + + return r != 0; +} + +static int device_setup_devlink_unit_one(Manager *m, const char *devlink, Set **ready_units, Set **not_ready_units) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + Unit *u; + + assert(m); + assert(devlink); + assert(ready_units); + assert(not_ready_units); + + if (sd_device_new_from_devname(&dev, devlink) >= 0 && device_is_ready(dev)) + return device_setup_unit(m, dev, devlink, /* main = */ false, ready_units); + + /* the devlink is already removed or not ready */ + if (device_by_path(m, devlink, &u) < 0) + return 0; /* The corresponding .device unit not found. That's fine. */ + + return set_ensure_put(not_ready_units, NULL, DEVICE(u)); +} + +static int device_setup_extra_units(Manager *m, sd_device *dev, Set **ready_units, Set **not_ready_units) { + _cleanup_strv_free_ char **aliases = NULL; + const char *syspath, *devname = NULL; + Device *l; + int r; + + assert(m); + assert(dev); + assert(ready_units); + assert(not_ready_units); + + r = sd_device_get_syspath(dev, &syspath); + if (r < 0) + return r; + + (void) sd_device_get_devname(dev, &devname); + + /* devlink units */ + FOREACH_DEVICE_DEVLINK(dev, devlink) { + /* These are a kind of special devlink. They should be always unique, but neither persistent + * nor predictable. Hence, let's refuse them. See also the comments for alias units below. */ + if (PATH_STARTSWITH_SET(devlink, "/dev/block/", "/dev/char/")) + continue; + + (void) device_setup_devlink_unit_one(m, devlink, ready_units, not_ready_units); + } + + if (device_is_ready(dev)) { + const char *s; + + r = sd_device_get_property_value(dev, "SYSTEMD_ALIAS", &s); + if (r < 0 && r != -ENOENT) + log_device_warning_errno(dev, r, "Failed to get SYSTEMD_ALIAS property, ignoring: %m"); + if (r >= 0) { + r = strv_split_full(&aliases, s, NULL, EXTRACT_UNQUOTE); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_ALIAS property, ignoring: %m"); + } + } + + /* alias units */ + STRV_FOREACH(alias, aliases) { + if (!path_is_absolute(*alias)) { + log_device_warning(dev, "The alias \"%s\" specified in SYSTEMD_ALIAS is not an absolute path, ignoring.", *alias); + continue; + } + + if (!path_is_safe(*alias)) { + log_device_warning(dev, "The alias \"%s\" specified in SYSTEMD_ALIAS is not safe, ignoring.", *alias); + continue; + } + + /* Note, even if the devlink is not persistent, LVM expects /dev/block/ symlink units exist. + * To achieve that, they set the path to SYSTEMD_ALIAS. Hence, we cannot refuse aliases start + * with /dev/, unfortunately. */ + + (void) device_setup_unit(m, dev, *alias, /* main = */ false, ready_units); + } + + l = hashmap_get(m->devices_by_sysfs, syspath); + LIST_FOREACH(same_sysfs, d, l) { + if (!d->path) + continue; + + if (path_equal(d->path, syspath)) + continue; /* This is the main unit. */ + + if (devname && path_equal(d->path, devname)) + continue; /* This is the real device node. */ + + if (device_has_devlink(dev, d->path)) + continue; /* The devlink was already processed in the above loop. */ + + if (strv_contains(aliases, d->path)) + continue; /* This is already processed in the above, and ready. */ + + if (path_startswith(d->path, "/dev/")) + /* This is a devlink unit. Check existence and update syspath. */ + (void) device_setup_devlink_unit_one(m, d->path, ready_units, not_ready_units); + else + /* This is an alias unit of dropped or not ready device. */ + (void) set_ensure_put(not_ready_units, NULL, d); + } + + return 0; +} + +static int device_setup_units(Manager *m, sd_device *dev, Set **ready_units, Set **not_ready_units) { + const char *syspath, *devname = NULL; + int r; + + assert(m); + assert(dev); + assert(ready_units); + assert(not_ready_units); + + r = sd_device_get_syspath(dev, &syspath); + if (r < 0) + return log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m"); + + /* First, process the main (that is, points to the syspath) and (real, not symlink) devnode units. */ + if (device_for_action(dev, SD_DEVICE_REMOVE)) + /* If the device is removed, the main and devnode units will be removed by + * device_update_found_by_sysfs() in device_dispatch_io(). Hence, it is not necessary to + * store them to not_ready_units, and we have nothing to do here. + * + * Note, still we need to process devlink units below, as a devlink previously points to this + * device may still exist and now point to another device node. That is, do not forget to + * call device_setup_extra_units(). */ + ; + else if (device_is_ready(dev)) { + /* Add the main unit named after the syspath. If this one fails, don't bother with the rest, + * as this one shall be the main device unit the others just follow. (Compare with how + * device_following() is implemented, see below, which looks for the sysfs device.) */ + r = device_setup_unit(m, dev, syspath, /* main = */ true, ready_units); + if (r < 0) + return r; + + /* Add an additional unit for the device node */ + if (sd_device_get_devname(dev, &devname) >= 0) + (void) device_setup_unit(m, dev, devname, /* main = */ false, ready_units); + + } else { + Unit *u; + + /* If the device exists but not ready, then save the units and unset udev bits later. */ + + if (device_by_path(m, syspath, &u) >= 0) { + r = set_ensure_put(not_ready_units, NULL, DEVICE(u)); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to store unit, ignoring: %m"); + } + + if (sd_device_get_devname(dev, &devname) >= 0 && + device_by_path(m, devname, &u) >= 0) { + r = set_ensure_put(not_ready_units, NULL, DEVICE(u)); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to store unit, ignoring: %m"); + } + } + + /* Next, add/update additional .device units point to aliases and symlinks. */ + (void) device_setup_extra_units(m, dev, ready_units, not_ready_units); + + /* Safety check: no unit should be in ready_units and not_ready_units simultaneously. */ + Unit *u; + SET_FOREACH(u, *not_ready_units) + if (set_remove(*ready_units, u)) + log_unit_error(u, "Cannot activate and deactivate the unit simultaneously. Deactivating."); + + return 0; +} + +static Unit *device_following(Unit *u) { + Device *d = DEVICE(u); + Device *first = NULL; + + assert(d); + + if (startswith(u->id, "sys-")) + return NULL; + + /* Make everybody follow the unit that's named after the sysfs path */ + LIST_FOREACH(same_sysfs, other, d->same_sysfs_next) + if (startswith(UNIT(other)->id, "sys-")) + return UNIT(other); + + LIST_FOREACH_BACKWARDS(same_sysfs, other, d->same_sysfs_prev) { + if (startswith(UNIT(other)->id, "sys-")) + return UNIT(other); + + first = other; + } + + return UNIT(first); +} + +static int device_following_set(Unit *u, Set **_set) { + Device *d = DEVICE(u); + _cleanup_set_free_ Set *set = NULL; + int r; + + assert(d); + assert(_set); + + if (LIST_JUST_US(same_sysfs, d)) { + *_set = NULL; + return 0; + } + + set = set_new(NULL); + if (!set) + return -ENOMEM; + + LIST_FOREACH(same_sysfs, other, d->same_sysfs_next) { + r = set_put(set, other); + if (r < 0) + return r; + } + + LIST_FOREACH_BACKWARDS(same_sysfs, other, d->same_sysfs_prev) { + r = set_put(set, other); + if (r < 0) + return r; + } + + *_set = TAKE_PTR(set); + return 1; +} + +static void device_shutdown(Manager *m) { + assert(m); + + m->device_monitor = sd_device_monitor_unref(m->device_monitor); + m->devices_by_sysfs = hashmap_free(m->devices_by_sysfs); +} + +static void device_enumerate(Manager *m) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert(m); + + if (!m->device_monitor) { + r = sd_device_monitor_new(&m->device_monitor); + if (r < 0) { + log_error_errno(r, "Failed to allocate device monitor: %m"); + goto fail; + } + + r = sd_device_monitor_filter_add_match_tag(m->device_monitor, "systemd"); + if (r < 0) { + log_error_errno(r, "Failed to add udev tag match: %m"); + goto fail; + } + + r = sd_device_monitor_attach_event(m->device_monitor, m->event); + if (r < 0) { + log_error_errno(r, "Failed to attach event to device monitor: %m"); + goto fail; + } + + r = sd_device_monitor_start(m->device_monitor, device_dispatch_io, m); + if (r < 0) { + log_error_errno(r, "Failed to start device monitor: %m"); + goto fail; + } + } + + r = sd_device_enumerator_new(&e); + if (r < 0) { + log_error_errno(r, "Failed to allocate device enumerator: %m"); + goto fail; + } + + r = sd_device_enumerator_add_match_tag(e, "systemd"); + if (r < 0) { + log_error_errno(r, "Failed to set tag for device enumeration: %m"); + goto fail; + } + + FOREACH_DEVICE(e, dev) { + _cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL; + Device *d; + + if (device_setup_units(m, dev, &ready_units, ¬_ready_units) < 0) + continue; + + SET_FOREACH(d, ready_units) + device_update_found_one(d, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV); + SET_FOREACH(d, not_ready_units) + device_update_found_one(d, DEVICE_NOT_FOUND, DEVICE_FOUND_UDEV); + } + + return; + +fail: + device_shutdown(m); +} + +static void device_propagate_reload(Manager *m, Device *d) { + int r; + + assert(m); + assert(d); + + if (d->state == DEVICE_DEAD) + return; + + r = manager_propagate_reload(m, UNIT(d), JOB_REPLACE, NULL); + if (r < 0) + log_unit_warning_errno(UNIT(d), r, "Failed to propagate reload, ignoring: %m"); +} + +static void device_remove_old_on_move(Manager *m, sd_device *dev) { + _cleanup_free_ char *syspath_old = NULL; + const char *devpath_old; + int r; + + assert(m); + assert(dev); + + r = sd_device_get_property_value(dev, "DEVPATH_OLD", &devpath_old); + if (r < 0) + return (void) log_device_debug_errno(dev, r, "Failed to get DEVPATH_OLD= property on 'move' uevent, ignoring: %m"); + + syspath_old = path_join("/sys", devpath_old); + if (!syspath_old) + return (void) log_oom(); + + device_update_found_by_sysfs(m, syspath_old, DEVICE_NOT_FOUND, DEVICE_FOUND_MASK); +} + +static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata) { + _cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL; + Manager *m = ASSERT_PTR(userdata); + sd_device_action_t action; + const char *sysfs; + bool ready; + Device *d; + int r; + + assert(dev); + + log_device_uevent(dev, "Processing udev action"); + + r = sd_device_get_syspath(dev, &sysfs); + if (r < 0) { + log_device_warning_errno(dev, r, "Failed to get device syspath, ignoring: %m"); + return 0; + } + + r = sd_device_get_action(dev, &action); + if (r < 0) { + log_device_warning_errno(dev, r, "Failed to get udev action, ignoring: %m"); + return 0; + } + + log_device_debug(dev, "Got '%s' action on syspath '%s'.", device_action_to_string(action), sysfs); + + if (action == SD_DEVICE_MOVE) + device_remove_old_on_move(m, dev); + + /* When udevd failed to process the device, SYSTEMD_ALIAS or any other properties may contain invalid + * values. Let's refuse to handle the uevent. */ + if (sd_device_get_property_value(dev, "UDEV_WORKER_FAILED", NULL) >= 0) { + int v; + + if (device_get_property_int(dev, "UDEV_WORKER_ERRNO", &v) >= 0) + log_device_warning_errno(dev, v, "systemd-udevd failed to process the device, ignoring: %m"); + else if (device_get_property_int(dev, "UDEV_WORKER_EXIT_STATUS", &v) >= 0) + log_device_warning(dev, "systemd-udevd failed to process the device with exit status %i, ignoring.", v); + else if (device_get_property_int(dev, "UDEV_WORKER_SIGNAL", &v) >= 0) { + const char *s; + (void) sd_device_get_property_value(dev, "UDEV_WORKER_SIGNAL_NAME", &s); + log_device_warning(dev, "systemd-udevd failed to process the device with signal %i(%s), ignoring.", v, strna(s)); + } else + log_device_warning(dev, "systemd-udevd failed to process the device with unknown result, ignoring."); + + return 0; + } + + /* A change event can signal that a device is becoming ready, in particular if the device is using + * the SYSTEMD_READY logic in udev so we need to reach the else block of the following if, even for + * change events */ + ready = device_is_ready(dev); + + (void) device_setup_units(m, dev, &ready_units, ¬_ready_units); + + if (action == SD_DEVICE_REMOVE) { + r = swap_process_device_remove(m, dev); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to process swap device remove event, ignoring: %m"); + } else if (ready) { + r = swap_process_device_new(m, dev); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to process swap device new event, ignoring: %m"); + } + + if (!IN_SET(action, SD_DEVICE_ADD, SD_DEVICE_REMOVE, SD_DEVICE_MOVE)) + SET_FOREACH(d, ready_units) + device_propagate_reload(m, d); + + if (!set_isempty(ready_units)) + manager_dispatch_load_queue(m); + + if (action == SD_DEVICE_REMOVE) + /* If we get notified that a device was removed by udev, then it's completely gone, hence + * unset all found bits. Note this affects all .device units still point to the removed + * device. */ + device_update_found_by_sysfs(m, sysfs, DEVICE_NOT_FOUND, DEVICE_FOUND_MASK); + + /* These devices are found and ready now, set the udev found bit. Note, this is also necessary to do + * on remove uevent, as some devlinks may be updated and now point to other device nodes. */ + SET_FOREACH(d, ready_units) + device_update_found_one(d, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV); + + /* These devices may be nominally around, but not ready for us. Hence unset the udev bit, but leave + * the rest around. This may be redundant for remove uevent, but should be harmless. */ + SET_FOREACH(d, not_ready_units) + device_update_found_one(d, DEVICE_NOT_FOUND, DEVICE_FOUND_UDEV); + + return 0; +} + +void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask) { + int r; + + assert(m); + assert(node); + assert(!FLAGS_SET(mask, DEVICE_FOUND_UDEV)); + + if (!udev_available()) + return; + + if (mask == 0) + return; + + /* This is called whenever we find a device referenced in /proc/swaps or /proc/self/mounts. Such a device might + * be mounted/enabled at a time where udev has not finished probing it yet, and we thus haven't learned about + * it yet. In this case we will set the device unit to "tentative" state. + * + * This takes a pair of DeviceFound flags parameters. The 'mask' parameter is a bit mask that indicates which + * bits of 'found' to copy into the per-device DeviceFound flags field. Thus, this function may be used to set + * and unset individual bits in a single call, while merging partially with previous state. */ + + if ((found & mask) != 0) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + /* If the device is known in the kernel and newly appeared, then we'll create a device unit for it, + * under the name referenced in /proc/swaps or /proc/self/mountinfo. But first, let's validate if + * everything is alright with the device node. Note that we're fine with missing device nodes, + * but not with badly set up ones. */ + + r = sd_device_new_from_devname(&dev, node); + if (r == -ENODEV) + log_debug("Could not find device for %s, continuing without device node", node); + else if (r < 0) { + /* Reduce log noise from nodes which are not device nodes by skipping EINVAL. */ + if (r != -EINVAL) + log_error_errno(r, "Failed to open %s device, ignoring: %m", node); + return; + } + + (void) device_setup_unit(m, dev, node, /* main = */ false, NULL); /* 'dev' may be NULL. */ + } + + /* Update the device unit's state, should it exist */ + (void) device_update_found_by_name(m, node, found, mask); +} + +bool device_shall_be_bound_by(Unit *device, Unit *u) { + assert(device); + assert(u); + + if (u->type != UNIT_MOUNT) + return false; + + return DEVICE(device)->bind_mounts; +} + +const UnitVTable device_vtable = { + .object_size = sizeof(Device), + .sections = + "Unit\0" + "Device\0" + "Install\0", + + .gc_jobs = true, + + .init = device_init, + .done = device_done, + .load = device_load, + + .coldplug = device_coldplug, + .catchup = device_catchup, + + .serialize = device_serialize, + .deserialize_item = device_deserialize_item, + + .dump = device_dump, + + .active_state = device_active_state, + .sub_state_to_string = device_sub_state_to_string, + + .following = device_following, + .following_set = device_following_set, + + .enumerate = device_enumerate, + .shutdown = device_shutdown, + .supported = udev_available, + + .status_message_formats = { + .starting_stopping = { + [0] = "Expecting device %s...", + [1] = "Waiting for device %s to disappear...", + }, + .finished_start_job = { + [JOB_DONE] = "Found device %s.", + [JOB_TIMEOUT] = "Timed out waiting for device %s.", + }, + }, +}; diff --git a/src/core/device.h b/src/core/device.h new file mode 100644 index 0000000..9dd6fb5 --- /dev/null +++ b/src/core/device.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "unit.h" + +typedef struct Device Device; + +/* A mask specifying where we have seen the device currently. This is a bitmask because the device might show up + * asynchronously from each other at various places. For example, in very common case a device might already be mounted + * before udev finished probing it (think: a script setting up a loopback block device, formatting it and mounting it + * in quick succession). Hence we need to track precisely where it is already visible and where not. */ +typedef enum DeviceFound { + DEVICE_NOT_FOUND = 0, + DEVICE_FOUND_UDEV = 1 << 0, /* The device has shown up in the udev database */ + DEVICE_FOUND_MOUNT = 1 << 1, /* The device has shown up in /proc/self/mountinfo */ + DEVICE_FOUND_SWAP = 1 << 2, /* The device has shown up in /proc/swaps */ + DEVICE_FOUND_MASK = DEVICE_FOUND_UDEV|DEVICE_FOUND_MOUNT|DEVICE_FOUND_SWAP, +} DeviceFound; + +struct Device { + Unit meta; + + char *sysfs, *deserialized_sysfs; + char *path; /* syspath, device node, alias, or devlink */ + + /* In order to be able to distinguish dependencies on different device nodes we might end up creating multiple + * devices for the same sysfs path. We chain them up here. */ + LIST_FIELDS(struct Device, same_sysfs); + + DeviceState state, deserialized_state; + DeviceFound found, deserialized_found, enumerated_found; + + bool bind_mounts; + + /* The SYSTEMD_WANTS udev property for this device the last time we saw it */ + char **wants_property; +}; + +extern const UnitVTable device_vtable; + +void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask); +bool device_shall_be_bound_by(Unit *device, Unit *u); + +DEFINE_CAST(DEVICE, Device); diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c new file mode 100644 index 0000000..12724c6 --- /dev/null +++ b/src/core/dynamic-user.c @@ -0,0 +1,871 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "clean-ipc.h" +#include "dynamic-user.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "iovec-util.h" +#include "lock-util.h" +#include "nscd-flush.h" +#include "parse-util.h" +#include "random-util.h" +#include "serialize.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "uid-alloc-range.h" +#include "user-util.h" + +/* Takes a value generated randomly or by hashing and turns it into a UID in the right range */ +#define UID_CLAMP_INTO_RANGE(rnd) (((uid_t) (rnd) % (DYNAMIC_UID_MAX - DYNAMIC_UID_MIN + 1)) + DYNAMIC_UID_MIN) + +DEFINE_TRIVIAL_REF_FUNC(DynamicUser, dynamic_user); + +DynamicUser* dynamic_user_free(DynamicUser *d) { + if (!d) + return NULL; + + if (d->manager) + (void) hashmap_remove(d->manager->dynamic_users, d->name); + + safe_close_pair(d->storage_socket); + return mfree(d); +} + +static int dynamic_user_add(Manager *m, const char *name, int storage_socket[static 2], DynamicUser **ret) { + DynamicUser *d; + int r; + + assert(m || ret); + assert(name); + assert(storage_socket); + + if (m) { /* Might be called in sd-executor with no manager object */ + r = hashmap_ensure_allocated(&m->dynamic_users, &string_hash_ops); + if (r < 0) + return r; + } + + d = malloc0(offsetof(DynamicUser, name) + strlen(name) + 1); + if (!d) + return -ENOMEM; + + strcpy(d->name, name); + + d->storage_socket[0] = storage_socket[0]; + d->storage_socket[1] = storage_socket[1]; + + if (m) { /* Might be called in sd-executor with no manager object */ + r = hashmap_put(m->dynamic_users, d->name, d); + if (r < 0) { + free(d); + return r; + } + } + + d->manager = m; + + if (ret) + *ret = d; + + return 0; +} + +static int dynamic_user_acquire(Manager *m, const char *name, DynamicUser** ret) { + _cleanup_close_pair_ int storage_socket[2] = EBADF_PAIR; + DynamicUser *d; + int r; + + assert(m); + assert(name); + + /* Return the DynamicUser structure for a specific user name. Note that this won't actually allocate a UID for + * it, but just prepare the data structure for it. The UID is allocated only on demand, when it's really + * needed, and in the child process we fork off, since allocation involves NSS checks which are not OK to do + * from PID 1. To allow the children and PID 1 share information about allocated UIDs we use an anonymous + * AF_UNIX/SOCK_DGRAM socket (called the "storage socket") that contains at most one datagram with the + * allocated UID number, plus an fd referencing the lock file for the UID + * (i.e. /run/systemd/dynamic-uid/$UID). Why involve the socket pair? So that PID 1 and all its children can + * share the same storage for the UID and lock fd, simply by inheriting the storage socket fds. The socket pair + * may exist in three different states: + * + * a) no datagram stored. This is the initial state. In this case the dynamic user was never realized. + * + * b) a datagram containing a UID stored, but no lock fd attached to it. In this case there was already a + * statically assigned UID by the same name, which we are reusing. + * + * c) a datagram containing a UID stored, and a lock fd is attached to it. In this case we allocated a dynamic + * UID and locked it in the file system, using the lock fd. + * + * As PID 1 and various children might access the socket pair simultaneously, and pop the datagram or push it + * back in any time, we also maintain a lock on the socket pair. Note one peculiarity regarding locking here: + * the UID lock on disk is protected via a BSD file lock (i.e. an fd-bound lock), so that the lock is kept in + * place as long as there's a reference to the fd open. The lock on the storage socket pair however is a POSIX + * file lock (i.e. a process-bound lock), as all users share the same fd of this (after all it is anonymous, + * nobody else could get any access to it except via our own fd) and we want to synchronize access between all + * processes that have access to it. */ + + d = hashmap_get(m->dynamic_users, name); + if (d) { + if (ret) { + /* We already have a structure for the dynamic user, let's increase the ref count and reuse it */ + d->n_ref++; + *ret = d; + } + return 0; + } + + if (!valid_user_group_name(name, VALID_USER_ALLOW_NUMERIC)) + return -EINVAL; + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, storage_socket) < 0) + return -errno; + + r = dynamic_user_add(m, name, storage_socket, &d); + if (r < 0) + return r; + + storage_socket[0] = storage_socket[1] = -EBADF; + + if (ret) { + d->n_ref++; + *ret = d; + } + + return 1; +} + +static int make_uid_symlinks(uid_t uid, const char *name, bool b) { + + char path1[STRLEN("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1]; + const char *path2; + int r = 0, k; + + /* Add direct additional symlinks for direct lookups of dynamic UIDs and their names by userspace code. The + * only reason we have this is because dbus-daemon cannot use D-Bus for resolving users and groups (since it + * would be its own client then). We hence keep these world-readable symlinks in place, so that the + * unprivileged dbus user can read the mappings when it needs them via these symlinks instead of having to go + * via the bus. Ideally, we'd use the lock files we keep for this anyway, but we can't since we use BSD locks + * on them and as those may be taken by any user with read access we can't make them world-readable. */ + + xsprintf(path1, "/run/systemd/dynamic-uid/direct:" UID_FMT, uid); + if (unlink(path1) < 0 && errno != ENOENT) + r = -errno; + + if (b && symlink(name, path1) < 0) { + k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path1); + if (r == 0) + r = k; + } + + path2 = strjoina("/run/systemd/dynamic-uid/direct:", name); + if (unlink(path2) < 0 && errno != ENOENT) { + k = -errno; + if (r == 0) + r = k; + } + + if (b && symlink(path1 + STRLEN("/run/systemd/dynamic-uid/direct:"), path2) < 0) { + k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path2); + if (r == 0) + r = k; + } + + return r; +} + +static int pick_uid(char **suggested_paths, const char *name, uid_t *ret_uid) { + + /* Find a suitable free UID. We use the following strategy to find a suitable UID: + * + * 1. Initially, we try to read the UID of a number of specified paths. If any of these UIDs works, we use + * them. We use in order to increase the chance of UID reuse, if StateDirectory=, CacheDirectory= or + * LogsDirectory= are used, as reusing the UID these directories are owned by saves us from having to + * recursively chown() them to new users. + * + * 2. If that didn't yield a currently unused UID, we hash the user name, and try to use that. This should be + * pretty good, as the use ris by default derived from the unit name, and hence the same service and same + * user should usually get the same UID as long as our hashing doesn't clash. + * + * 3. Finally, if that didn't work, we randomly pick UIDs, until we find one that is empty. + * + * Since the dynamic UID space is relatively small we'll stop trying after 100 iterations, giving up. */ + + enum { + PHASE_SUGGESTED, /* the first phase, reusing directory ownership UIDs */ + PHASE_HASHED, /* the second phase, deriving a UID from the username by hashing */ + PHASE_RANDOM, /* the last phase, randomly picking UIDs */ + } phase = PHASE_SUGGESTED; + + static const uint8_t hash_key[] = { + 0x37, 0x53, 0x7e, 0x31, 0xcf, 0xce, 0x48, 0xf5, + 0x8a, 0xbb, 0x39, 0x57, 0x8d, 0xd9, 0xec, 0x59 + }; + + unsigned n_tries = 100, current_suggested = 0; + int r; + + (void) mkdir("/run/systemd/dynamic-uid", 0755); + + for (;;) { + char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_close_ int lock_fd = -EBADF; + uid_t candidate; + ssize_t l; + + if (--n_tries <= 0) /* Give up retrying eventually */ + return -EBUSY; + + switch (phase) { + + case PHASE_SUGGESTED: { + struct stat st; + + if (!suggested_paths || !suggested_paths[current_suggested]) { + /* We reached the end of the suggested paths list, let's try by hashing the name */ + phase = PHASE_HASHED; + continue; + } + + if (stat(suggested_paths[current_suggested++], &st) < 0) + continue; /* We can't read the UID of this path, but that doesn't matter, just try the next */ + + candidate = st.st_uid; + break; + } + + case PHASE_HASHED: + /* A static user by this name does not exist yet. Let's find a free ID then, and use that. We + * start with a UID generated as hash from the user name. */ + candidate = UID_CLAMP_INTO_RANGE(siphash24(name, strlen(name), hash_key)); + + /* If this one fails, we should proceed with random tries */ + phase = PHASE_RANDOM; + break; + + case PHASE_RANDOM: + + /* Pick another random UID, and see if that works for us. */ + random_bytes(&candidate, sizeof(candidate)); + candidate = UID_CLAMP_INTO_RANGE(candidate); + break; + + default: + assert_not_reached(); + } + + /* Make sure whatever we picked here actually is in the right range */ + if (!uid_is_dynamic(candidate)) + continue; + + xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, candidate); + + for (;;) { + struct stat st; + + lock_fd = open(lock_path, O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, 0600); + if (lock_fd < 0) + return -errno; + + r = flock(lock_fd, LOCK_EX|LOCK_NB); /* Try to get a BSD file lock on the UID lock file */ + if (r < 0) { + if (IN_SET(errno, EBUSY, EAGAIN)) + goto next; /* already in use */ + + return -errno; + } + + if (fstat(lock_fd, &st) < 0) + return -errno; + if (st.st_nlink > 0) + break; + + /* Oh, bummer, we got the lock, but the file was unlinked between the time we opened it and + * got the lock. Close it, and try again. */ + lock_fd = safe_close(lock_fd); + } + + /* Some superficial check whether this UID/GID might already be taken by some static user */ + if (getpwuid(candidate) || + getgrgid((gid_t) candidate) || + search_ipc(candidate, (gid_t) candidate) != 0) { + (void) unlink(lock_path); + continue; + } + + /* Let's store the user name in the lock file, so that we can use it for looking up the username for a UID */ + l = pwritev(lock_fd, + (struct iovec[2]) { + IOVEC_MAKE_STRING(name), + IOVEC_MAKE((char[1]) { '\n' }, 1), + }, 2, 0); + if (l < 0) { + r = -errno; + (void) unlink(lock_path); + return r; + } + + (void) ftruncate(lock_fd, l); + (void) make_uid_symlinks(candidate, name, true); /* also add direct lookup symlinks */ + + *ret_uid = candidate; + return TAKE_FD(lock_fd); + + next: + ; + } +} + +static int dynamic_user_pop(DynamicUser *d, uid_t *ret_uid, int *ret_lock_fd) { + uid_t uid = UID_INVALID; + struct iovec iov = IOVEC_MAKE(&uid, sizeof(uid)); + int lock_fd; + ssize_t k; + + assert(d); + assert(ret_uid); + assert(ret_lock_fd); + + /* Read the UID and lock fd that is stored in the storage AF_UNIX socket. This should be called with + * the lock on the socket taken. */ + + k = receive_one_fd_iov(d->storage_socket[0], &iov, 1, MSG_DONTWAIT, &lock_fd); + if (k < 0) + return (int) k; + + *ret_uid = uid; + *ret_lock_fd = lock_fd; + + return 0; +} + +static int dynamic_user_push(DynamicUser *d, uid_t uid, int lock_fd) { + struct iovec iov = IOVEC_MAKE(&uid, sizeof(uid)); + + assert(d); + + /* Store the UID and lock_fd in the storage socket. This should be called with the socket pair lock taken. */ + return send_one_fd_iov(d->storage_socket[1], lock_fd, &iov, 1, MSG_DONTWAIT); +} + +static void unlink_uid_lock(int lock_fd, uid_t uid, const char *name) { + char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + + if (lock_fd < 0) + return; + + xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid); + (void) unlink(lock_path); + + (void) make_uid_symlinks(uid, name, false); /* remove direct lookup symlinks */ +} + +static int dynamic_user_realize( + DynamicUser *d, + char **suggested_dirs, + uid_t *ret_uid, gid_t *ret_gid, + bool is_user) { + + _cleanup_close_ int uid_lock_fd = -EBADF; + _cleanup_close_ int etc_passwd_lock_fd = -EBADF; + uid_t num = UID_INVALID; /* a uid if is_user, and a gid otherwise */ + gid_t gid = GID_INVALID; /* a gid if is_user, ignored otherwise */ + bool flush_cache = false; + int r; + + assert(d); + assert(is_user == !!ret_uid); + assert(ret_gid); + + /* Acquire a UID for the user name. This will allocate a UID for the user name if the user doesn't exist + * yet. If it already exists its existing UID/GID will be reused. */ + + r = posix_lock(d->storage_socket[0], LOCK_EX); + if (r < 0) + return r; + + CLEANUP_POSIX_UNLOCK(d->storage_socket[0]); + + r = dynamic_user_pop(d, &num, &uid_lock_fd); + if (r < 0) { + int new_uid_lock_fd; + uid_t new_uid; + + if (r != -EAGAIN) + return r; + + /* OK, nothing stored yet, let's try to find something useful. While we are working on this release the + * lock however, so that nobody else blocks on our NSS lookups. */ + r = posix_lock(d->storage_socket[0], LOCK_UN); + if (r < 0) + return r; + + /* Let's see if a proper, static user or group by this name exists. Try to take the lock on + * /etc/passwd, if that fails with EROFS then /etc is read-only. In that case it's fine if we don't + * take the lock, given that users can't be added there anyway in this case. */ + etc_passwd_lock_fd = take_etc_passwd_lock(NULL); + if (etc_passwd_lock_fd < 0 && etc_passwd_lock_fd != -EROFS) + return etc_passwd_lock_fd; + + /* First, let's parse this as numeric UID */ + r = parse_uid(d->name, &num); + if (r < 0) { + struct passwd *p; + struct group *g; + + if (is_user) { + /* OK, this is not a numeric UID. Let's see if there's a user by this name */ + p = getpwnam(d->name); + if (p) { + num = p->pw_uid; + gid = p->pw_gid; + } else { + /* if the user does not exist but the group with the same name exists, refuse operation */ + g = getgrnam(d->name); + if (g) + return -EILSEQ; + } + } else { + /* Let's see if there's a group by this name */ + g = getgrnam(d->name); + if (g) + num = (uid_t) g->gr_gid; + else { + /* if the group does not exist but the user with the same name exists, refuse operation */ + p = getpwnam(d->name); + if (p) + return -EILSEQ; + } + } + } + + if (num == UID_INVALID) { + /* No static UID assigned yet, excellent. Let's pick a new dynamic one, and lock it. */ + + uid_lock_fd = pick_uid(suggested_dirs, d->name, &num); + if (uid_lock_fd < 0) + return uid_lock_fd; + } + + /* So, we found a working UID/lock combination. Let's see if we actually still need it. */ + r = posix_lock(d->storage_socket[0], LOCK_EX); + if (r < 0) { + unlink_uid_lock(uid_lock_fd, num, d->name); + return r; + } + + r = dynamic_user_pop(d, &new_uid, &new_uid_lock_fd); + if (r < 0) { + if (r != -EAGAIN) { + /* OK, something bad happened, let's get rid of the bits we acquired. */ + unlink_uid_lock(uid_lock_fd, num, d->name); + return r; + } + + /* Great! Nothing is stored here, still. Store our newly acquired data. */ + flush_cache = true; + } else { + /* Hmm, so as it appears there's now something stored in the storage socket. Throw away what we + * acquired, and use what's stored now. */ + + unlink_uid_lock(uid_lock_fd, num, d->name); + safe_close(uid_lock_fd); + + num = new_uid; + uid_lock_fd = new_uid_lock_fd; + } + } else if (is_user && !uid_is_dynamic(num)) { + struct passwd *p; + + /* Statically allocated user may have different uid and gid. So, let's obtain the gid. */ + errno = 0; + p = getpwuid(num); + if (!p) + return errno_or_else(ESRCH); + + gid = p->pw_gid; + } + + /* If the UID/GID was already allocated dynamically, push the data we popped out back in. If it was already + * allocated statically, push the UID back too, but do not push the lock fd in. If we allocated the UID + * dynamically right here, push that in along with the lock fd for it. */ + r = dynamic_user_push(d, num, uid_lock_fd); + if (r < 0) + return r; + + if (flush_cache) { + /* If we allocated a new dynamic UID, refresh nscd, so that it forgets about potentially cached + * negative entries. But let's do so after we release the /etc/passwd lock, so that there's no + * potential for nscd wanting to lock that for completing the invalidation. */ + etc_passwd_lock_fd = safe_close(etc_passwd_lock_fd); + (void) nscd_flush_cache(STRV_MAKE("passwd", "group")); + } + + if (is_user) { + *ret_uid = num; + *ret_gid = gid != GID_INVALID ? gid : num; + } else + *ret_gid = num; + + return 0; +} + +int dynamic_user_current(DynamicUser *d, uid_t *ret) { + _cleanup_close_ int lock_fd = -EBADF; + uid_t uid; + int r; + + assert(d); + + /* Get the currently assigned UID for the user, if there's any. This simply pops the data from the + * storage socket, and pushes it back in right-away. */ + + r = posix_lock(d->storage_socket[0], LOCK_EX); + if (r < 0) + return r; + + CLEANUP_POSIX_UNLOCK(d->storage_socket[0]); + + r = dynamic_user_pop(d, &uid, &lock_fd); + if (r < 0) + return r; + + r = dynamic_user_push(d, uid, lock_fd); + if (r < 0) + return r; + + if (ret) + *ret = uid; + + return 0; +} + +static DynamicUser* dynamic_user_unref(DynamicUser *d) { + if (!d) + return NULL; + + /* Note that this doesn't actually release any resources itself. If a dynamic user should be fully + * destroyed and its UID released, use dynamic_user_destroy() instead. NB: the dynamic user table may + * contain entries with no references, which is commonly the case right before a daemon reload. */ + + assert(d->n_ref > 0); + d->n_ref--; + + return NULL; +} + +static int dynamic_user_close(DynamicUser *d) { + _cleanup_close_ int lock_fd = -EBADF; + uid_t uid; + int r; + + /* Release the user ID, by releasing the lock on it, and emptying the storage socket. After this the + * user is unrealized again, much like it was after it the DynamicUser object was first allocated. */ + + r = posix_lock(d->storage_socket[0], LOCK_EX); + if (r < 0) + return r; + + CLEANUP_POSIX_UNLOCK(d->storage_socket[0]); + + r = dynamic_user_pop(d, &uid, &lock_fd); + if (r == -EAGAIN) + /* User wasn't realized yet, nothing to do. */ + return 0; + if (r < 0) + return r; + + /* This dynamic user was realized and dynamically allocated. In this case, let's remove the lock file. */ + unlink_uid_lock(lock_fd, uid, d->name); + + (void) nscd_flush_cache(STRV_MAKE("passwd", "group")); + return 1; +} + +static DynamicUser* dynamic_user_destroy(DynamicUser *d) { + if (!d) + return NULL; + + /* Drop a reference to a DynamicUser object, and destroy the user completely if this was the last + * reference. This is called whenever a service is shut down and wants its dynamic UID gone. Note that + * dynamic_user_unref() is what is called whenever a service is simply freed, for example during a reload + * cycle, where the dynamic users should not be destroyed, but our datastructures should. */ + + dynamic_user_unref(d); + + if (d->n_ref > 0) + return NULL; + + (void) dynamic_user_close(d); + return dynamic_user_free(d); +} + +int dynamic_user_serialize_one(DynamicUser *d, const char *key, FILE *f, FDSet *fds) { + int copy0, copy1; + + assert(key); + assert(f); + assert(fds); + + if (!d) + return 0; + + if (d->storage_socket[0] < 0 || d->storage_socket[1] < 0) + return 0; + + copy0 = fdset_put_dup(fds, d->storage_socket[0]); + if (copy0 < 0) + return log_error_errno(copy0, "Failed to add dynamic user storage fd to serialization: %m"); + + copy1 = fdset_put_dup(fds, d->storage_socket[1]); + if (copy1 < 0) + return log_error_errno(copy1, "Failed to add dynamic user storage fd to serialization: %m"); + + (void) serialize_item_format(f, key, "%s %i %i", d->name, copy0, copy1); + + return 0; +} + +int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds) { + DynamicUser *d; + + assert(m); + + /* Dump the dynamic user database into the manager serialization, to deal with daemon reloads. */ + + HASHMAP_FOREACH(d, m->dynamic_users) + (void) dynamic_user_serialize_one(d, "dynamic-user", f, fds); + + return 0; +} + +void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, DynamicUser **ret) { + _cleanup_free_ char *name = NULL, *s0 = NULL, *s1 = NULL; + _cleanup_close_ int fd0 = -EBADF, fd1 = -EBADF; + int r; + + assert(value); + assert(fds); + + /* Parse the serialization again, after a daemon reload */ + + r = extract_many_words(&value, NULL, 0, &name, &s0, &s1, NULL); + if (r != 3 || !isempty(value)) { + log_debug("Unable to parse dynamic user line."); + return; + } + + fd0 = deserialize_fd(fds, s0); + if (fd0 < 0) + return; + + fd1 = deserialize_fd(fds, s1); + if (fd1 < 0) + return; + + r = dynamic_user_add(m, name, (int[]) { fd0, fd1 }, ret); + if (r < 0) { + log_debug_errno(r, "Failed to add dynamic user: %m"); + return; + } + + TAKE_FD(fd0); + TAKE_FD(fd1); + + if (ret) /* If the caller uses it directly, increment the refcount */ + (*ret)->n_ref++; +} + +void dynamic_user_vacuum(Manager *m, bool close_user) { + DynamicUser *d; + + assert(m); + + /* Empty the dynamic user database, optionally cleaning up orphaned dynamic users, i.e. destroy and free users + * to which no reference exist. This is called after a daemon reload finished, in order to destroy users which + * might not be referenced anymore. */ + + HASHMAP_FOREACH(d, m->dynamic_users) { + if (d->n_ref > 0) + continue; + + if (close_user) { + log_debug("Removing orphaned dynamic user %s", d->name); + (void) dynamic_user_close(d); + } + + dynamic_user_free(d); + } +} + +int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret) { + char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_free_ char *user = NULL; + uid_t check_uid; + int r; + + assert(m); + assert(ret); + + /* A friendly way to translate a dynamic user's UID into a name. */ + if (!uid_is_dynamic(uid)) + return -ESRCH; + + xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid); + r = read_one_line_file(lock_path, &user); + if (IN_SET(r, -ENOENT, 0)) + return -ESRCH; + if (r < 0) + return r; + + /* The lock file might be stale, hence let's verify the data before we return it */ + r = dynamic_user_lookup_name(m, user, &check_uid); + if (r < 0) + return r; + if (check_uid != uid) /* lock file doesn't match our own idea */ + return -ESRCH; + + *ret = TAKE_PTR(user); + + return 0; +} + +int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret) { + DynamicUser *d; + int r; + + assert(m); + assert(name); + + /* A friendly call for translating a dynamic user's name into its UID */ + + d = hashmap_get(m->dynamic_users, name); + if (!d) + return -ESRCH; + + r = dynamic_user_current(d, ret); + if (r == -EAGAIN) /* not realized yet? */ + return -ESRCH; + + return r; +} + +int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicCreds **ret) { + _cleanup_(dynamic_creds_unrefp) DynamicCreds *creds = NULL; + bool acquired = false; + int r; + + assert(m); + assert(ret); + + if (!user && !group) { + *ret = NULL; + return 0; + } + + creds = new0(DynamicCreds, 1); + if (!creds) + return -ENOMEM; + + /* A DynamicUser object encapsulates an allocation of both a UID and a GID for a specific name. However, some + * services use different user and groups. For cases like that there's DynamicCreds containing a pair of user + * and group. This call allocates a pair. */ + + if (user) { + r = dynamic_user_acquire(m, user, &creds->user); + if (r < 0) + return r; + + acquired = true; + } + + if (creds->user && (!group || streq_ptr(user, group))) + creds->group = dynamic_user_ref(creds->user); + else if (group) { + r = dynamic_user_acquire(m, group, &creds->group); + if (r < 0) { + if (acquired) + creds->user = dynamic_user_unref(creds->user); + return r; + } + } + + *ret = TAKE_PTR(creds); + + return 0; +} + +int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid) { + uid_t u = UID_INVALID; + gid_t g = GID_INVALID; + int r; + + assert(creds); + assert(uid); + assert(gid); + + /* Realize both the referenced user and group */ + + if (creds->user) { + r = dynamic_user_realize(creds->user, suggested_paths, &u, &g, true); + if (r < 0) + return r; + } + + if (creds->group && creds->group != creds->user) { + r = dynamic_user_realize(creds->group, suggested_paths, NULL, &g, false); + if (r < 0) + return r; + } + + *uid = u; + *gid = g; + return 0; +} + +DynamicCreds* dynamic_creds_unref(DynamicCreds *creds) { + if (!creds) + return NULL; + + creds->user = dynamic_user_unref(creds->user); + creds->group = dynamic_user_unref(creds->group); + + return mfree(creds); +} + +DynamicCreds* dynamic_creds_destroy(DynamicCreds *creds) { + if (!creds) + return NULL; + + creds->user = dynamic_user_destroy(creds->user); + creds->group = dynamic_user_destroy(creds->group); + + return mfree(creds); +} + +void dynamic_creds_done(DynamicCreds *creds) { + if (!creds) + return; + + if (creds->group != creds->user) + dynamic_user_free(creds->group); + creds->group = creds->user = dynamic_user_free(creds->user); +} + +void dynamic_creds_close(DynamicCreds *creds) { + if (!creds) + return; + + if (creds->user) + safe_close_pair(creds->user->storage_socket); + + if (creds->group && creds->group != creds->user) + safe_close_pair(creds->group->storage_socket); +} diff --git a/src/core/dynamic-user.h b/src/core/dynamic-user.h new file mode 100644 index 0000000..303a7d0 --- /dev/null +++ b/src/core/dynamic-user.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct DynamicUser DynamicUser; + +typedef struct DynamicCreds { + /* A combination of a dynamic user and group */ + DynamicUser *user; + DynamicUser *group; +} DynamicCreds; + +#include "manager.h" + +/* Note that this object always allocates a pair of user and group under the same name, even if one of them isn't + * used. This means, if you want to allocate a group and user pair, and they might have two different names, then you + * need to allocated two of these objects. DynamicCreds below makes that easy. */ +struct DynamicUser { + Manager *manager; + unsigned n_ref; + + /* An AF_UNIX socket pair that contains a datagram containing both the numeric ID assigned, as well as a lock + * file fd locking the user ID we picked. */ + int storage_socket[2]; + + char name[]; +}; + +int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds); +int dynamic_user_serialize_one(DynamicUser *d, const char *key, FILE *f, FDSet *fds); +void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, DynamicUser **ret); +DynamicUser* dynamic_user_free(DynamicUser *d); +void dynamic_user_vacuum(Manager *m, bool close_user); + +int dynamic_user_current(DynamicUser *d, uid_t *ret); +int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret); +int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret); + +int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicCreds **ret); +int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid); + +DynamicCreds *dynamic_creds_unref(DynamicCreds *creds); +DynamicCreds *dynamic_creds_destroy(DynamicCreds *creds); +void dynamic_creds_done(DynamicCreds *creds); +void dynamic_creds_close(DynamicCreds *creds); + +DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_unref); +DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_destroy); + +DynamicUser *dynamic_user_ref(DynamicUser *user); diff --git a/src/core/efi-random.c b/src/core/efi-random.c new file mode 100644 index 0000000..dffde57 --- /dev/null +++ b/src/core/efi-random.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "chattr-util.h" +#include "efi-random.h" +#include "efivars.h" +#include "fd-util.h" +#include "fs-util.h" +#include "random-util.h" +#include "strv.h" + +void lock_down_efi_variables(void) { + _cleanup_close_ int fd = -EBADF; + int r; + + fd = open(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderSystemToken)), O_RDONLY|O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Unable to open LoaderSystemToken EFI variable, ignoring: %m"); + return; + } + + /* Paranoia: let's restrict access modes of these a bit, so that unprivileged users can't use them to + * identify the system or gain too much insight into what we might have credited to the entropy + * pool. */ + r = chattr_fd(fd, 0, FS_IMMUTABLE_FL, NULL); + if (r < 0) + log_warning_errno(r, "Failed to drop FS_IMMUTABLE_FL from LoaderSystemToken EFI variable, ignoring: %m"); + if (fchmod(fd, 0600) < 0) + log_warning_errno(errno, "Failed to reduce access mode of LoaderSystemToken EFI variable, ignoring: %m"); +} diff --git a/src/core/efi-random.h b/src/core/efi-random.h new file mode 100644 index 0000000..87166c9 --- /dev/null +++ b/src/core/efi-random.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +void lock_down_efi_variables(void); diff --git a/src/core/emergency-action.c b/src/core/emergency-action.c new file mode 100644 index 0000000..e2cd931 --- /dev/null +++ b/src/core/emergency-action.c @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "bus-error.h" +#include "bus-util.h" +#include "emergency-action.h" +#include "raw-reboot.h" +#include "reboot-util.h" +#include "special.h" +#include "string-table.h" +#include "terminal-util.h" +#include "virt.h" + +static const char* const emergency_action_table[_EMERGENCY_ACTION_MAX] = { + [EMERGENCY_ACTION_NONE] = "none", + [EMERGENCY_ACTION_REBOOT] = "reboot", + [EMERGENCY_ACTION_REBOOT_FORCE] = "reboot-force", + [EMERGENCY_ACTION_REBOOT_IMMEDIATE] = "reboot-immediate", + [EMERGENCY_ACTION_POWEROFF] = "poweroff", + [EMERGENCY_ACTION_POWEROFF_FORCE] = "poweroff-force", + [EMERGENCY_ACTION_POWEROFF_IMMEDIATE] = "poweroff-immediate", + [EMERGENCY_ACTION_EXIT] = "exit", + [EMERGENCY_ACTION_EXIT_FORCE] = "exit-force", + [EMERGENCY_ACTION_SOFT_REBOOT] = "soft-reboot", + [EMERGENCY_ACTION_SOFT_REBOOT_FORCE] = "soft-reboot-force", + [EMERGENCY_ACTION_KEXEC] = "kexec", + [EMERGENCY_ACTION_KEXEC_FORCE] = "kexec-force", + [EMERGENCY_ACTION_HALT] = "halt", + [EMERGENCY_ACTION_HALT_FORCE] = "halt-force", + [EMERGENCY_ACTION_HALT_IMMEDIATE] = "halt-immediate", +}; + +static void log_and_status(Manager *m, bool warn, const char *message, const char *reason) { + log_full(warn ? LOG_WARNING : LOG_DEBUG, "%s: %s", message, reason); + if (warn) + manager_status_printf(m, STATUS_TYPE_EMERGENCY, + ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL, + "%s: %s", message, reason); +} + +void emergency_action( + Manager *m, + EmergencyAction action, + EmergencyActionFlags options, + const char *reboot_arg, + int exit_status, + const char *reason) { + + Unit *u; + + assert(m); + assert(action >= 0); + assert(action < _EMERGENCY_ACTION_MAX); + + /* Is the special shutdown target active or queued? If so, we are in shutdown state */ + if (IN_SET(action, + EMERGENCY_ACTION_REBOOT, + EMERGENCY_ACTION_SOFT_REBOOT, + EMERGENCY_ACTION_POWEROFF, + EMERGENCY_ACTION_EXIT, + EMERGENCY_ACTION_KEXEC, + EMERGENCY_ACTION_HALT)) { + u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET); + if (u && unit_active_or_pending(u)) { + log_notice("Shutdown is already active. Skipping emergency action request %s.", + emergency_action_table[action]); + return; + } + } + + if (action == EMERGENCY_ACTION_NONE) + return; + + if (FLAGS_SET(options, EMERGENCY_ACTION_IS_WATCHDOG) && !m->service_watchdogs) { + log_warning("Watchdog disabled! Not acting on: %s", reason); + return; + } + + bool warn = FLAGS_SET(options, EMERGENCY_ACTION_WARN); + + switch (action) { + + case EMERGENCY_ACTION_REBOOT: + log_and_status(m, warn, "Rebooting", reason); + + (void) update_reboot_parameter_and_warn(reboot_arg, true); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL); + break; + + case EMERGENCY_ACTION_REBOOT_FORCE: + log_and_status(m, warn, "Forcibly rebooting", reason); + + (void) update_reboot_parameter_and_warn(reboot_arg, true); + m->objective = MANAGER_REBOOT; + break; + + case EMERGENCY_ACTION_REBOOT_IMMEDIATE: + log_and_status(m, warn, "Rebooting immediately", reason); + + sync(); + + if (!isempty(reboot_arg)) { + log_info("Rebooting with argument '%s'.", reboot_arg); + (void) raw_reboot(LINUX_REBOOT_CMD_RESTART2, reboot_arg); + log_warning_errno(errno, "Failed to reboot with parameter, retrying without: %m"); + } + + log_info("Rebooting."); + (void) reboot(RB_AUTOBOOT); + break; + + case EMERGENCY_ACTION_SOFT_REBOOT: + log_and_status(m, warn, "Soft-rebooting", reason); + + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_SOFT_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL); + break; + + case EMERGENCY_ACTION_SOFT_REBOOT_FORCE: + log_and_status(m, warn, "Forcibly soft-rebooting", reason); + + m->objective = MANAGER_SOFT_REBOOT; + break; + + case EMERGENCY_ACTION_EXIT: + + if (exit_status >= 0) + m->return_value = exit_status; + + if (MANAGER_IS_USER(m) || detect_container() > 0) { + log_and_status(m, warn, "Exiting", reason); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL); + break; + } + + log_notice("Doing \"poweroff\" action instead of an \"exit\" emergency action."); + _fallthrough_; + + case EMERGENCY_ACTION_POWEROFF: + log_and_status(m, warn, "Powering off", reason); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL); + break; + + case EMERGENCY_ACTION_EXIT_FORCE: + + if (exit_status >= 0) + m->return_value = exit_status; + + if (MANAGER_IS_USER(m) || detect_container() > 0) { + log_and_status(m, warn, "Exiting immediately", reason); + m->objective = MANAGER_EXIT; + break; + } + + log_notice("Doing \"poweroff-force\" action instead of an \"exit-force\" emergency action."); + _fallthrough_; + + case EMERGENCY_ACTION_POWEROFF_FORCE: + log_and_status(m, warn, "Forcibly powering off", reason); + m->objective = MANAGER_POWEROFF; + break; + + case EMERGENCY_ACTION_POWEROFF_IMMEDIATE: + log_and_status(m, warn, "Powering off immediately", reason); + + sync(); + + log_info("Powering off."); + (void) reboot(RB_POWER_OFF); + break; + + case EMERGENCY_ACTION_KEXEC: + log_and_status(m, warn, "Executing kexec", reason); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_KEXEC_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL); + break; + + case EMERGENCY_ACTION_KEXEC_FORCE: + log_and_status(m, warn, "Forcibly executing kexec", reason); + m->objective = MANAGER_KEXEC; + break; + + case EMERGENCY_ACTION_HALT: + log_and_status(m, warn, "Halting", reason); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL); + break; + + case EMERGENCY_ACTION_HALT_FORCE: + log_and_status(m, warn, "Forcibly halting", reason); + m->objective = MANAGER_HALT; + break; + + case EMERGENCY_ACTION_HALT_IMMEDIATE: + log_and_status(m, warn, "Halting immediately", reason); + + sync(); + + log_info("Halting."); + (void) reboot(RB_HALT_SYSTEM); + break; + + default: + assert_not_reached(); + } +} + +DEFINE_STRING_TABLE_LOOKUP(emergency_action, EmergencyAction); + +int parse_emergency_action( + const char *value, + RuntimeScope runtime_scope, + EmergencyAction *ret) { + + EmergencyAction x; + + x = emergency_action_from_string(value); + if (x < 0) + return -EINVAL; + + if (runtime_scope != RUNTIME_SCOPE_SYSTEM && x != EMERGENCY_ACTION_NONE && x < _EMERGENCY_ACTION_FIRST_USER_ACTION) + return -EOPNOTSUPP; + + *ret = x; + return 0; +} diff --git a/src/core/emergency-action.h b/src/core/emergency-action.h new file mode 100644 index 0000000..33e0ec6 --- /dev/null +++ b/src/core/emergency-action.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "runtime-scope.h" + +typedef enum EmergencyAction { + EMERGENCY_ACTION_NONE, + EMERGENCY_ACTION_REBOOT, + EMERGENCY_ACTION_REBOOT_FORCE, + EMERGENCY_ACTION_REBOOT_IMMEDIATE, + EMERGENCY_ACTION_POWEROFF, + EMERGENCY_ACTION_POWEROFF_FORCE, + EMERGENCY_ACTION_POWEROFF_IMMEDIATE, + EMERGENCY_ACTION_EXIT, + _EMERGENCY_ACTION_FIRST_USER_ACTION = EMERGENCY_ACTION_EXIT, + EMERGENCY_ACTION_EXIT_FORCE, + EMERGENCY_ACTION_SOFT_REBOOT, + EMERGENCY_ACTION_SOFT_REBOOT_FORCE, + EMERGENCY_ACTION_KEXEC, + EMERGENCY_ACTION_KEXEC_FORCE, + EMERGENCY_ACTION_HALT, + EMERGENCY_ACTION_HALT_FORCE, + EMERGENCY_ACTION_HALT_IMMEDIATE, + _EMERGENCY_ACTION_MAX, + _EMERGENCY_ACTION_INVALID = -EINVAL, +} EmergencyAction; + +typedef enum EmergencyActionFlags { + EMERGENCY_ACTION_IS_WATCHDOG = 1 << 0, + EMERGENCY_ACTION_WARN = 1 << 1, +} EmergencyActionFlags; + +#include "macro.h" +#include "manager.h" + +void emergency_action(Manager *m, + EmergencyAction action, EmergencyActionFlags options, + const char *reboot_arg, int exit_status, const char *reason); + +const char* emergency_action_to_string(EmergencyAction i) _const_; +EmergencyAction emergency_action_from_string(const char *s) _pure_; + +int parse_emergency_action(const char *value, RuntimeScope runtime_scope, EmergencyAction *ret); diff --git a/src/core/exec-credential.c b/src/core/exec-credential.c new file mode 100644 index 0000000..6bcfb68 --- /dev/null +++ b/src/core/exec-credential.c @@ -0,0 +1,1023 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "acl-util.h" +#include "creds-util.h" +#include "exec-credential.h" +#include "execute.h" +#include "fileio.h" +#include "glob-util.h" +#include "io-util.h" +#include "label-util.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mount.h" +#include "mountpoint-util.h" +#include "process-util.h" +#include "random-util.h" +#include "recurse-dir.h" +#include "rm-rf.h" +#include "tmpfile-util.h" + +ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) { + if (!sc) + return NULL; + + free(sc->id); + free(sc->data); + return mfree(sc); +} + +ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) { + if (!lc) + return NULL; + + free(lc->id); + free(lc->path); + return mfree(lc); +} + +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + exec_set_credential_hash_ops, + char, string_hash_func, string_compare_func, + ExecSetCredential, exec_set_credential_free); + +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + exec_load_credential_hash_ops, + char, string_hash_func, string_compare_func, + ExecLoadCredential, exec_load_credential_free); + +bool exec_context_has_credentials(const ExecContext *c) { + assert(c); + + return !hashmap_isempty(c->set_credentials) || + !hashmap_isempty(c->load_credentials) || + !set_isempty(c->import_credentials); +} + +bool exec_context_has_encrypted_credentials(ExecContext *c) { + ExecLoadCredential *load_cred; + ExecSetCredential *set_cred; + + assert(c); + + HASHMAP_FOREACH(load_cred, c->load_credentials) + if (load_cred->encrypted) + return true; + + HASHMAP_FOREACH(set_cred, c->set_credentials) + if (set_cred->encrypted) + return true; + + return false; +} + +static int get_credential_directory( + const char *runtime_prefix, + const char *unit, + char **ret) { + + char *p; + + assert(ret); + + if (!runtime_prefix || !unit) { + *ret = NULL; + return 0; + } + + p = path_join(runtime_prefix, "credentials", unit); + if (!p) + return -ENOMEM; + + *ret = p; + return 1; +} + +int exec_context_get_credential_directory( + const ExecContext *context, + const ExecParameters *params, + const char *unit, + char **ret) { + + assert(context); + assert(params); + assert(unit); + assert(ret); + + if (!exec_context_has_credentials(context)) { + *ret = NULL; + return 0; + } + + return get_credential_directory(params->prefix[EXEC_DIRECTORY_RUNTIME], unit, ret); +} + +int unit_add_default_credential_dependencies(Unit *u, const ExecContext *c) { + _cleanup_free_ char *p = NULL, *m = NULL; + int r; + + assert(u); + assert(c); + + if (!exec_context_has_credentials(c)) + return 0; + + /* Let's make sure the credentials directory of this service is unmounted *after* the service itself + * shuts down. This only matters if mount namespacing is not used for the service, and hence the + * credentials mount appears on the host. */ + + r = get_credential_directory(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id, &p); + if (r <= 0) + return r; + + r = unit_name_from_path(p, ".mount", &m); + if (r < 0) + return r; + + return unit_add_dependency_by_name(u, UNIT_AFTER, m, /* add_reference= */ true, UNIT_DEPENDENCY_FILE); +} + +int exec_context_destroy_credentials(Unit *u) { + _cleanup_free_ char *p = NULL; + int r; + + assert(u); + + r = get_credential_directory(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id, &p); + if (r <= 0) + return r; + + /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to + * unmount it, and afterwards remove the mount point */ + if (umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW) >= 0) + (void) mount_invalidate_state_by_path(u->manager, p); + + (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD); + + return 0; +} + +static int write_credential( + int dfd, + const char *id, + const void *data, + size_t size, + uid_t uid, + gid_t gid, + bool ownership_ok) { + + _cleanup_(unlink_and_freep) char *tmp = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + r = tempfn_random_child("", "cred", &tmp); + if (r < 0) + return r; + + fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600); + if (fd < 0) { + tmp = mfree(tmp); + return -errno; + } + + r = loop_write(fd, data, size); + if (r < 0) + return r; + + if (fchmod(fd, 0400) < 0) /* Take away "w" bit */ + return -errno; + + if (uid_is_valid(uid) && uid != getuid()) { + r = fd_add_uid_acl_permission(fd, uid, ACL_READ); + if (r < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) + return r; + + if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want + * to express: that the user gets read access and nothing + * else. But if the backing fs can't support that (e.g. ramfs) + * then we can use file ownership instead. But that's only safe if + * we can then re-mount the whole thing read-only, so that the + * user can no longer chmod() the file to gain write access. */ + return r; + + if (fchown(fd, uid, gid) < 0) + return -errno; + } + } + + if (renameat(dfd, tmp, dfd, id) < 0) + return -errno; + + tmp = mfree(tmp); + return 0; +} + +typedef enum CredentialSearchPath { + CREDENTIAL_SEARCH_PATH_TRUSTED, + CREDENTIAL_SEARCH_PATH_ENCRYPTED, + CREDENTIAL_SEARCH_PATH_ALL, + _CREDENTIAL_SEARCH_PATH_MAX, + _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL, +} CredentialSearchPath; + +static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) { + + _cleanup_strv_free_ char **l = NULL; + + assert(params); + assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX); + + /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in + * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted + * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */ + + if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) { + if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0) + return NULL; + + if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0) + return NULL; + } + + if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) { + if (params->received_credentials_directory) + if (strv_extend(&l, params->received_credentials_directory) < 0) + return NULL; + + if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0) + return NULL; + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *t = strv_join(l, ":"); + + log_debug("Credential search path is: %s", strempty(t)); + } + + return TAKE_PTR(l); +} + +static int maybe_decrypt_and_write_credential( + int dir_fd, + const char *id, + bool encrypted, + uid_t uid, + gid_t gid, + bool ownership_ok, + const char *data, + size_t size, + uint64_t *left) { + + _cleanup_free_ void *plaintext = NULL; + size_t add; + int r; + + if (encrypted) { + size_t plaintext_size = 0; + + r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, + &plaintext, &plaintext_size); + if (r < 0) + return r; + + data = plaintext; + size = plaintext_size; + } + + add = strlen(id) + size; + if (add > *left) + return -E2BIG; + + r = write_credential(dir_fd, id, data, size, uid, gid, ownership_ok); + if (r < 0) + return log_debug_errno(r, "Failed to write credential '%s': %m", id); + + *left -= add; + return 0; +} + +static int load_credential_glob( + const char *path, + bool encrypted, + char **search_path, + ReadFullFileFlags flags, + int write_dfd, + uid_t uid, + gid_t gid, + bool ownership_ok, + uint64_t *left) { + + int r; + + STRV_FOREACH(d, search_path) { + _cleanup_globfree_ glob_t pglob = {}; + _cleanup_free_ char *j = NULL; + + j = path_join(*d, path); + if (!j) + return -ENOMEM; + + r = safe_glob(j, 0, &pglob); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + + for (size_t n = 0; n < pglob.gl_pathc; n++) { + _cleanup_free_ char *fn = NULL; + _cleanup_(erase_and_freep) char *data = NULL; + size_t size; + + /* path is absolute, hence pass AT_FDCWD as nop dir fd here */ + r = read_full_file_full( + AT_FDCWD, + pglob.gl_pathv[n], + UINT64_MAX, + encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX, + flags, + NULL, + &data, &size); + if (r < 0) + return log_debug_errno(r, "Failed to read credential '%s': %m", + pglob.gl_pathv[n]); + + r = path_extract_filename(pglob.gl_pathv[n], &fn); + if (r < 0) + return log_debug_errno(r, "Failed to extract filename from '%s': %m", + pglob.gl_pathv[n]); + + r = maybe_decrypt_and_write_credential( + write_dfd, + fn, + encrypted, + uid, + gid, + ownership_ok, + data, size, + left); + if (r == -EEXIST) + continue; + if (r < 0) + return r; + } + } + + return 0; +} + +static int load_credential( + const ExecContext *context, + const ExecParameters *params, + const char *id, + const char *path, + bool encrypted, + const char *unit, + int read_dfd, + int write_dfd, + uid_t uid, + gid_t gid, + bool ownership_ok, + uint64_t *left) { + + ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER; + _cleanup_strv_free_ char **search_path = NULL; + _cleanup_(erase_and_freep) char *data = NULL; + _cleanup_free_ char *bindname = NULL; + const char *source = NULL; + bool missing_ok = true; + size_t size, maxsz; + int r; + + assert(context); + assert(params); + assert(id); + assert(path); + assert(unit); + assert(read_dfd >= 0 || read_dfd == AT_FDCWD); + assert(write_dfd >= 0); + assert(left); + + if (read_dfd >= 0) { + /* If a directory fd is specified, then read the file directly from that dir. In this case we + * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX + * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to + * open it. */ + + if (!filename_is_valid(path)) /* safety check */ + return -EINVAL; + + missing_ok = true; + source = path; + + } else if (path_is_absolute(path)) { + /* If this is an absolute path, read the data directly from it, and support AF_UNIX + * sockets */ + + if (!path_is_valid(path)) /* safety check */ + return -EINVAL; + + flags |= READ_FULL_FILE_CONNECT_SOCKET; + + /* Pass some minimal info about the unit and the credential name we are looking to acquire + * via the source socket address in case we read off an AF_UNIX socket. */ + if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0) + return -ENOMEM; + + missing_ok = false; + source = path; + + } else if (credential_name_valid(path)) { + /* If this is a relative path, take it as credential name relative to the credentials + * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we + * are operating on a credential store, i.e. this is guaranteed to be regular files. */ + + search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL); + if (!search_path) + return -ENOMEM; + + missing_ok = true; + } else + source = NULL; + + if (encrypted) + flags |= READ_FULL_FILE_UNBASE64; + + maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX; + + if (search_path) { + STRV_FOREACH(d, search_path) { + _cleanup_free_ char *j = NULL; + + j = path_join(*d, path); + if (!j) + return -ENOMEM; + + r = read_full_file_full( + AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */ + UINT64_MAX, + maxsz, + flags, + NULL, + &data, &size); + if (r != -ENOENT) + break; + } + } else if (source) + r = read_full_file_full( + read_dfd, source, + UINT64_MAX, + maxsz, + flags, + bindname, + &data, &size); + else + r = -ENOENT; + + if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) { + /* Make a missing inherited credential non-fatal, let's just continue. After all apps + * will get clear errors if we don't pass such a missing credential on as they + * themselves will get ENOENT when trying to read them, which should not be much + * worse than when we handle the error here and make it fatal. + * + * Also, if the source file doesn't exist, but a fallback is set via SetCredentials= + * we are fine, too. */ + log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path); + return 0; + } + if (r < 0) + return log_debug_errno(r, "Failed to read credential '%s': %m", path); + + return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, gid, ownership_ok, data, size, left); +} + +struct load_cred_args { + const ExecContext *context; + const ExecParameters *params; + bool encrypted; + const char *unit; + int dfd; + uid_t uid; + gid_t gid; + bool ownership_ok; + uint64_t *left; +}; + +static int load_cred_recurse_dir_cb( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + + struct load_cred_args *args = ASSERT_PTR(userdata); + _cleanup_free_ char *sub_id = NULL; + int r; + + if (event != RECURSE_DIR_ENTRY) + return RECURSE_DIR_CONTINUE; + + if (!IN_SET(de->d_type, DT_REG, DT_SOCK)) + return RECURSE_DIR_CONTINUE; + + sub_id = strreplace(path, "/", "_"); + if (!sub_id) + return -ENOMEM; + + if (!credential_name_valid(sub_id)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id); + + if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) { + log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path); + return RECURSE_DIR_CONTINUE; + } + if (errno != ENOENT) + return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id); + + r = load_credential( + args->context, + args->params, + sub_id, + de->d_name, + args->encrypted, + args->unit, + dir_fd, + args->dfd, + args->uid, + args->gid, + args->ownership_ok, + args->left); + if (r < 0) + return r; + + return RECURSE_DIR_CONTINUE; +} + +static int acquire_credentials( + const ExecContext *context, + const ExecParameters *params, + const char *unit, + const char *p, + uid_t uid, + gid_t gid, + bool ownership_ok) { + + uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX; + _cleanup_close_ int dfd = -EBADF; + const char *ic; + ExecLoadCredential *lc; + ExecSetCredential *sc; + int r; + + assert(context); + assert(p); + + dfd = open(p, O_DIRECTORY|O_CLOEXEC); + if (dfd < 0) + return -errno; + + r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */ + if (r < 0) + return r; + + /* First, load credentials off disk (or acquire via AF_UNIX socket) */ + HASHMAP_FOREACH(lc, context->load_credentials) { + _cleanup_close_ int sub_fd = -EBADF; + + /* If this is an absolute path, then try to open it as a directory. If that works, then we'll + * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as + * a regular file. Finally, if it's a relative path we will use it as a credential name to + * propagate a credential passed to us from further up. */ + + if (path_is_absolute(lc->path)) { + sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY); + if (sub_fd < 0 && !IN_SET(errno, + ENOTDIR, /* Not a directory */ + ENOENT)) /* Doesn't exist? */ + return log_debug_errno(errno, "Failed to open '%s': %m", lc->path); + } + + if (sub_fd < 0) + /* Regular file (incl. a credential passed in from higher up) */ + r = load_credential( + context, + params, + lc->id, + lc->path, + lc->encrypted, + unit, + AT_FDCWD, + dfd, + uid, + gid, + ownership_ok, + &left); + else + /* Directory */ + r = recurse_dir( + sub_fd, + /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */ + /* statx_mask= */ 0, + /* n_depth_max= */ UINT_MAX, + RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE, + load_cred_recurse_dir_cb, + &(struct load_cred_args) { + .context = context, + .params = params, + .encrypted = lc->encrypted, + .unit = unit, + .dfd = dfd, + .uid = uid, + .gid = gid, + .ownership_ok = ownership_ok, + .left = &left, + }); + if (r < 0) + return r; + } + + /* Next, look for system credentials and credentials in the credentials store. Note that these do not + * override any credentials found earlier. */ + SET_FOREACH(ic, context->import_credentials) { + _cleanup_free_ char **search_path = NULL; + + search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED); + if (!search_path) + return -ENOMEM; + + r = load_credential_glob( + ic, + /* encrypted = */ false, + search_path, + READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER, + dfd, + uid, + gid, + ownership_ok, + &left); + if (r < 0) + return r; + + search_path = strv_free(search_path); + search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED); + if (!search_path) + return -ENOMEM; + + r = load_credential_glob( + ic, + /* encrypted = */ true, + search_path, + READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64, + dfd, + uid, + gid, + ownership_ok, + &left); + if (r < 0) + return r; + } + + /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not + * add them, so that they can act as a "default" if the same credential is specified multiple times. */ + HASHMAP_FOREACH(sc, context->set_credentials) { + _cleanup_(erase_and_freep) void *plaintext = NULL; + const char *data; + size_t size, add; + + /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return + * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda + * slow and involved, hence it's nice to be able to skip that if the credential already + * exists anyway. */ + if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) + continue; + if (errno != ENOENT) + return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id); + + if (sc->encrypted) { + r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size); + if (r < 0) + return r; + + data = plaintext; + } else { + data = sc->data; + size = sc->size; + } + + add = strlen(sc->id) + size; + if (add > left) + return -E2BIG; + + r = write_credential(dfd, sc->id, data, size, uid, gid, ownership_ok); + if (r < 0) + return r; + + left -= add; + } + + r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */ + if (r < 0) + return r; + + /* After we created all keys with the right perms, also make sure the credential store as a whole is + * accessible */ + + if (uid_is_valid(uid) && uid != getuid()) { + r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE); + if (r < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) + return r; + + if (!ownership_ok) + return r; + + if (fchown(dfd, uid, gid) < 0) + return -errno; + } + } + + return 0; +} + +static int setup_credentials_internal( + const ExecContext *context, + const ExecParameters *params, + const char *unit, + const char *final, /* This is where the credential store shall eventually end up at */ + const char *workspace, /* This is where we can prepare it before moving it to the final place */ + bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */ + bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */ + uid_t uid, + gid_t gid) { + + int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true + * if we mounted something; false if we definitely can't mount anything */ + bool final_mounted; + const char *where; + + assert(context); + assert(final); + assert(workspace); + + if (reuse_workspace) { + r = path_is_mount_point(workspace, NULL, 0); + if (r < 0) + return r; + if (r > 0) + workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse + * it, let's keep this in mind */ + else + workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */ + } else + workspace_mounted = -1; /* ditto */ + + r = path_is_mount_point(final, NULL, 0); + if (r < 0) + return r; + if (r > 0) { + /* If the final place already has something mounted, we use that. If the workspace also has + * something mounted we assume it's actually the same mount (but with MS_RDONLY + * different). */ + final_mounted = true; + + if (workspace_mounted < 0) { + /* If the final place is mounted, but the workspace isn't, then let's bind mount + * the final version to the workspace, and make it writable, so that we can make + * changes */ + + r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + + r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL); + if (r < 0) + return r; + + workspace_mounted = true; + } + } else + final_mounted = false; + + if (workspace_mounted < 0) { + /* Nothing is mounted on the workspace yet, let's try to mount something now */ + + r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false); + if (r < 0) { + /* If that didn't work, try to make a bind mount from the final to the workspace, so + * that we can make it writable there. */ + r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(r)) + /* Propagate anything that isn't a permission problem. */ + return r; + + if (must_mount) + /* If it's not OK to use the plain directory fallback, propagate all + * errors too. */ + return r; + + /* If we lack privileges to bind mount stuff, then let's gracefully proceed + * for compat with container envs, and just use the final dir as is. */ + + workspace_mounted = false; + } else { + /* Make the new bind mount writable (i.e. drop MS_RDONLY) */ + r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL); + if (r < 0) + return r; + + workspace_mounted = true; + } + } else + workspace_mounted = true; + } + + assert(!must_mount || workspace_mounted > 0); + where = workspace_mounted ? workspace : final; + + (void) label_fix_full(AT_FDCWD, where, final, 0); + + r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted); + if (r < 0) + return r; + + if (workspace_mounted) { + bool install; + + /* Determine if we should actually install the prepared mount in the final location by bind + * mounting it there. We do so only if the mount is not established there already, and if the + * mount is actually non-empty (i.e. carries at least one credential). Not that in the best + * case we are doing all this in a mount namespace, thus no one else will see that we + * allocated a file system we are getting rid of again here. */ + if (final_mounted) + install = false; /* already installed */ + else { + r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false); + if (r < 0) + return r; + + install = r == 0; /* install only if non-empty */ + } + + if (install) { + /* Make workspace read-only now, so that any bind mount we make from it defaults to + * read-only too */ + r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL); + if (r < 0) + return r; + + /* And mount it to the final place, read-only */ + r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL); + } else + /* Otherwise get rid of it */ + r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW); + if (r < 0) + return r; + } else { + _cleanup_free_ char *parent = NULL; + + /* If we do not have our own mount put used the plain directory fallback, then we need to + * open access to the top-level credential directory and the per-service directory now */ + + r = path_extract_directory(final, &parent); + if (r < 0) + return r; + if (chmod(parent, 0755) < 0) + return -errno; + } + + return 0; +} + +int exec_setup_credentials( + const ExecContext *context, + const ExecParameters *params, + const char *unit, + uid_t uid, + gid_t gid) { + + _cleanup_free_ char *p = NULL, *q = NULL; + int r; + + assert(context); + assert(params); + + if (!exec_context_has_credentials(context)) + return 0; + + if (!params->prefix[EXEC_DIRECTORY_RUNTIME]) + return -EINVAL; + + /* This where we'll place stuff when we are done; this main credentials directory is world-readable, + * and the subdir we mount over with a read-only file system readable by the service's user */ + q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials"); + if (!q) + return -ENOMEM; + + r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */ + if (r < 0 && r != -EEXIST) + return r; + + p = path_join(q, unit); + if (!p) + return -ENOMEM; + + r = mkdir_label(p, 0700); /* per-unit dir: private to user */ + if (r < 0 && r != -EEXIST) + return r; + + r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS, NULL); + if (r < 0) { + _cleanup_(rmdir_and_freep) char *u = NULL; /* remove the temporary workspace if we can */ + _cleanup_free_ char *t = NULL; + + /* If this is not a privilege or support issue then propagate the error */ + if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) + return r; + + /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving + * it into place, so that users can't access half-initialized credential stores. */ + t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials"); + if (!t) + return -ENOMEM; + + /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit + * directory outside of /run/credentials/ first, and then move it over to /run/credentials/ + * after it is fully set up */ + u = path_join(t, unit); + if (!u) + return -ENOMEM; + + FOREACH_STRING(i, t, u) { + r = mkdir_label(i, 0700); + if (r < 0 && r != -EEXIST) + return r; + } + + r = setup_credentials_internal( + context, + params, + unit, + p, /* final mount point */ + u, /* temporary workspace to overmount */ + true, /* reuse the workspace if it is already a mount */ + false, /* it's OK to fall back to a plain directory if we can't mount anything */ + uid, + gid); + if (r < 0) + return r; + + } else if (r == 0) { + + /* We managed to set up a mount namespace, and are now in a child. That's great. In this case + * we can use the same directory for all cases, after turning off propagation. Question + * though is: where do we turn off propagation exactly, and where do we place the workspace + * directory? We need some place that is guaranteed to be a mount point in the host, and + * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this, + * since we ultimately want to move the resulting file system there, i.e. we need propagation + * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that + * would be visible in the host mount table all the time, which we want to avoid. Hence, what + * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that + * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off + * propagation on the former, and then overmount the latter. + * + * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist + * for this purpose, but there are few other candidates that work equally well for us, and + * given that we do this in a privately namespaced short-lived single-threaded process that + * no one else sees this should be OK to do. */ + + /* Turn off propagation from our namespace to host */ + r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); + if (r < 0) + goto child_fail; + + r = setup_credentials_internal( + context, + params, + unit, + p, /* final mount point */ + "/dev/shm", /* temporary workspace to overmount */ + false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */ + true, /* insist that something is mounted, do not allow fallback to plain directory */ + uid, + gid); + if (r < 0) + goto child_fail; + + _exit(EXIT_SUCCESS); + + child_fail: + _exit(EXIT_FAILURE); + } + + /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's + * try to remove it. This matters in particular if we created the dir as mount point but then didn't + * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being + * seen by users when trying access this inode. */ + (void) rmdir(p); + return 0; +} diff --git a/src/core/exec-credential.h b/src/core/exec-credential.h new file mode 100644 index 0000000..6f836fb --- /dev/null +++ b/src/core/exec-credential.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "hash-funcs.h" + +typedef struct ExecContext ExecContext; +typedef struct ExecParameters ExecParameters; +typedef struct Unit Unit; + +/* A credential configured with LoadCredential= */ +typedef struct ExecLoadCredential { + char *id, *path; + bool encrypted; +} ExecLoadCredential; + +/* A credential configured with SetCredential= */ +typedef struct ExecSetCredential { + char *id; + bool encrypted; + void *data; + size_t size; +} ExecSetCredential; + +ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc); +DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSetCredential*, exec_set_credential_free); + +ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc); +DEFINE_TRIVIAL_CLEANUP_FUNC(ExecLoadCredential*, exec_load_credential_free); + +extern const struct hash_ops exec_set_credential_hash_ops; +extern const struct hash_ops exec_load_credential_hash_ops; + +bool exec_context_has_encrypted_credentials(ExecContext *c); +bool exec_context_has_credentials(const ExecContext *c); + +int exec_context_get_credential_directory( + const ExecContext *context, + const ExecParameters *params, + const char *unit, + char **ret); + +int unit_add_default_credential_dependencies(Unit *u, const ExecContext *c); + +int exec_context_destroy_credentials(Unit *u); +int exec_setup_credentials( + const ExecContext *context, + const ExecParameters *params, + const char *unit, + uid_t uid, + gid_t gid); diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c new file mode 100644 index 0000000..28d6142 --- /dev/null +++ b/src/core/exec-invoke.c @@ -0,0 +1,5235 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#if HAVE_PAM +#include +#include +#endif + +#if HAVE_APPARMOR +#include +#endif + +#include "sd-messages.h" + +#if HAVE_APPARMOR +#include "apparmor-util.h" +#endif +#include "argv-util.h" +#include "barrier.h" +#include "bpf-dlopen.h" +#include "bpf-lsm.h" +#include "btrfs-util.h" +#include "capability-util.h" +#include "cgroup-setup.h" +#include "chase.h" +#include "chattr-util.h" +#include "chown-recursive.h" +#include "copy.h" +#include "data-fd-util.h" +#include "env-util.h" +#include "escape.h" +#include "exec-credential.h" +#include "exec-invoke.h" +#include "execute.h" +#include "exit-status.h" +#include "fd-util.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "iovec-util.h" +#include "missing_ioprio.h" +#include "missing_prctl.h" +#include "missing_securebits.h" +#include "missing_syscall.h" +#include "mkdir-label.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "psi-util.h" +#include "rlimit-util.h" +#include "seccomp-util.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "smack-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "strv.h" +#include "terminal-util.h" +#include "utmp-wtmp.h" + +#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC) +#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC) + +#define SNDBUF_SIZE (8*1024*1024) + +static int shift_fds(int fds[], size_t n_fds) { + if (n_fds <= 0) + return 0; + + /* Modifies the fds array! (sorts it) */ + + assert(fds); + + for (int start = 0;;) { + int restart_from = -1; + + for (int i = start; i < (int) n_fds; i++) { + int nfd; + + /* Already at right index? */ + if (fds[i] == i+3) + continue; + + nfd = fcntl(fds[i], F_DUPFD, i + 3); + if (nfd < 0) + return -errno; + + safe_close(fds[i]); + fds[i] = nfd; + + /* Hmm, the fd we wanted isn't free? Then + * let's remember that and try again from here */ + if (nfd != i+3 && restart_from < 0) + restart_from = i; + } + + if (restart_from < 0) + break; + + start = restart_from; + } + + return 0; +} + +static int flag_fds( + const int fds[], + size_t n_socket_fds, + size_t n_fds, + bool nonblock) { + + int r; + + assert(fds || n_fds == 0); + + /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags. + * O_NONBLOCK only applies to socket activation though. */ + + for (size_t i = 0; i < n_fds; i++) { + + if (i < n_socket_fds) { + r = fd_nonblock(fds[i], nonblock); + if (r < 0) + return r; + } + + /* We unconditionally drop FD_CLOEXEC from the fds, + * since after all we want to pass these fds to our + * children */ + + r = fd_cloexec(fds[i], false); + if (r < 0) + return r; + } + + return 0; +} + +static bool is_terminal_input(ExecInput i) { + return IN_SET(i, + EXEC_INPUT_TTY, + EXEC_INPUT_TTY_FORCE, + EXEC_INPUT_TTY_FAIL); +} + +static bool is_terminal_output(ExecOutput o) { + return IN_SET(o, + EXEC_OUTPUT_TTY, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE); +} + +static bool is_kmsg_output(ExecOutput o) { + return IN_SET(o, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_KMSG_AND_CONSOLE); +} + +static bool exec_context_needs_term(const ExecContext *c) { + assert(c); + + /* Return true if the execution context suggests we should set $TERM to something useful. */ + + if (is_terminal_input(c->std_input)) + return true; + + if (is_terminal_output(c->std_output)) + return true; + + if (is_terminal_output(c->std_error)) + return true; + + return !!c->tty_path; +} + +static int open_null_as(int flags, int nfd) { + int fd; + + assert(nfd >= 0); + + fd = open("/dev/null", flags|O_NOCTTY); + if (fd < 0) + return -errno; + + return move_fd(fd, nfd, false); +} + +static int connect_journal_socket( + int fd, + const char *log_namespace, + uid_t uid, + gid_t gid) { + + uid_t olduid = UID_INVALID; + gid_t oldgid = GID_INVALID; + const char *j; + int r; + + j = log_namespace ? + strjoina("/run/systemd/journal.", log_namespace, "/stdout") : + "/run/systemd/journal/stdout"; + + if (gid_is_valid(gid)) { + oldgid = getgid(); + + if (setegid(gid) < 0) + return -errno; + } + + if (uid_is_valid(uid)) { + olduid = getuid(); + + if (seteuid(uid) < 0) { + r = -errno; + goto restore_gid; + } + } + + r = connect_unix_path(fd, AT_FDCWD, j); + + /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if + an LSM interferes. */ + + if (uid_is_valid(uid)) + (void) seteuid(olduid); + + restore_gid: + if (gid_is_valid(gid)) + (void) setegid(oldgid); + + return r; +} + +static int connect_logger_as( + const ExecContext *context, + const ExecParameters *params, + ExecOutput output, + const char *ident, + int nfd, + uid_t uid, + gid_t gid) { + + _cleanup_close_ int fd = -EBADF; + int r; + + assert(context); + assert(params); + assert(output < _EXEC_OUTPUT_MAX); + assert(ident); + assert(nfd >= 0); + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -errno; + + r = connect_journal_socket(fd, context->log_namespace, uid, gid); + if (r < 0) + return r; + + if (shutdown(fd, SHUT_RD) < 0) + return -errno; + + (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); + + if (dprintf(fd, + "%s\n" + "%s\n" + "%i\n" + "%i\n" + "%i\n" + "%i\n" + "%i\n", + context->syslog_identifier ?: ident, + params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "", + context->syslog_priority, + !!context->syslog_level_prefix, + false, + is_kmsg_output(output), + is_terminal_output(output)) < 0) + return -errno; + + return move_fd(TAKE_FD(fd), nfd, false); +} + +static int open_terminal_as(const char *path, int flags, int nfd) { + int fd; + + assert(path); + assert(nfd >= 0); + + fd = open_terminal(path, flags | O_NOCTTY); + if (fd < 0) + return fd; + + return move_fd(fd, nfd, false); +} + +static int acquire_path(const char *path, int flags, mode_t mode) { + _cleanup_close_ int fd = -EBADF; + int r; + + assert(path); + + if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR)) + flags |= O_CREAT; + + fd = open(path, flags|O_NOCTTY, mode); + if (fd >= 0) + return TAKE_FD(fd); + + if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */ + return -errno; + + /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */ + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -errno; + + r = connect_unix_path(fd, AT_FDCWD, path); + if (IN_SET(r, -ENOTSOCK, -EINVAL)) + /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this + * wasn't an AF_UNIX socket after all */ + return -ENXIO; + if (r < 0) + return r; + + if ((flags & O_ACCMODE) == O_RDONLY) + r = shutdown(fd, SHUT_WR); + else if ((flags & O_ACCMODE) == O_WRONLY) + r = shutdown(fd, SHUT_RD); + else + r = 0; + if (r < 0) + return -errno; + + return TAKE_FD(fd); +} + +static int fixup_input( + const ExecContext *context, + int socket_fd, + bool apply_tty_stdin) { + + ExecInput std_input; + + assert(context); + + std_input = context->std_input; + + if (is_terminal_input(std_input) && !apply_tty_stdin) + return EXEC_INPUT_NULL; + + if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0) + return EXEC_INPUT_NULL; + + if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0) + return EXEC_INPUT_NULL; + + return std_input; +} + +static int fixup_output(ExecOutput output, int socket_fd) { + + if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0) + return EXEC_OUTPUT_INHERIT; + + return output; +} + +static int setup_input( + const ExecContext *context, + const ExecParameters *params, + int socket_fd, + const int named_iofds[static 3]) { + + ExecInput i; + int r; + + assert(context); + assert(params); + assert(named_iofds); + + if (params->stdin_fd >= 0) { + if (dup2(params->stdin_fd, STDIN_FILENO) < 0) + return -errno; + + /* Try to make this the controlling tty, if it is a tty, and reset it */ + if (isatty(STDIN_FILENO)) { + (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE); + + if (context->tty_reset) + (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true); + + (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL); + } + + return STDIN_FILENO; + } + + i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); + + switch (i) { + + case EXEC_INPUT_NULL: + return open_null_as(O_RDONLY, STDIN_FILENO); + + case EXEC_INPUT_TTY: + case EXEC_INPUT_TTY_FORCE: + case EXEC_INPUT_TTY_FAIL: { + _cleanup_close_ int tty_fd = -EBADF; + const char *tty_path; + + tty_path = ASSERT_PTR(exec_context_tty_path(context)); + + tty_fd = acquire_terminal(tty_path, + i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY : + i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE : + ACQUIRE_TERMINAL_WAIT, + USEC_INFINITY); + if (tty_fd < 0) + return tty_fd; + + r = exec_context_apply_tty_size(context, tty_fd, tty_path); + if (r < 0) + return r; + + r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false); + if (r < 0) + return r; + + TAKE_FD(tty_fd); + return r; + } + + case EXEC_INPUT_SOCKET: + assert(socket_fd >= 0); + + return RET_NERRNO(dup2(socket_fd, STDIN_FILENO)); + + case EXEC_INPUT_NAMED_FD: + assert(named_iofds[STDIN_FILENO] >= 0); + + (void) fd_nonblock(named_iofds[STDIN_FILENO], false); + return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO)); + + case EXEC_INPUT_DATA: { + int fd; + + fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0); + if (fd < 0) + return fd; + + return move_fd(fd, STDIN_FILENO, false); + } + + case EXEC_INPUT_FILE: { + bool rw; + int fd; + + assert(context->stdio_file[STDIN_FILENO]); + + rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) || + (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO])); + + fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask); + if (fd < 0) + return fd; + + return move_fd(fd, STDIN_FILENO, false); + } + + default: + assert_not_reached(); + } +} + +static bool can_inherit_stderr_from_stdout( + const ExecContext *context, + ExecOutput o, + ExecOutput e) { + + assert(context); + + /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the + * stderr fd */ + + if (e == EXEC_OUTPUT_INHERIT) + return true; + if (e != o) + return false; + + if (e == EXEC_OUTPUT_NAMED_FD) + return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]); + + if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE)) + return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]); + + return true; +} + +static int setup_output( + const ExecContext *context, + const ExecParameters *params, + int fileno, + int socket_fd, + const int named_iofds[static 3], + const char *ident, + uid_t uid, + gid_t gid, + dev_t *journal_stream_dev, + ino_t *journal_stream_ino) { + + ExecOutput o; + ExecInput i; + int r; + + assert(context); + assert(params); + assert(ident); + assert(journal_stream_dev); + assert(journal_stream_ino); + + if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) { + + if (dup2(params->stdout_fd, STDOUT_FILENO) < 0) + return -errno; + + return STDOUT_FILENO; + } + + if (fileno == STDERR_FILENO && params->stderr_fd >= 0) { + if (dup2(params->stderr_fd, STDERR_FILENO) < 0) + return -errno; + + return STDERR_FILENO; + } + + i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); + o = fixup_output(context->std_output, socket_fd); + + if (fileno == STDERR_FILENO) { + ExecOutput e; + e = fixup_output(context->std_error, socket_fd); + + /* This expects the input and output are already set up */ + + /* Don't change the stderr file descriptor if we inherit all + * the way and are not on a tty */ + if (e == EXEC_OUTPUT_INHERIT && + o == EXEC_OUTPUT_INHERIT && + i == EXEC_INPUT_NULL && + !is_terminal_input(context->std_input) && + getppid() != 1) + return fileno; + + /* Duplicate from stdout if possible */ + if (can_inherit_stderr_from_stdout(context, o, e)) + return RET_NERRNO(dup2(STDOUT_FILENO, fileno)); + + o = e; + + } else if (o == EXEC_OUTPUT_INHERIT) { + /* If input got downgraded, inherit the original value */ + if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input)) + return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno); + + /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */ + if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA)) + return RET_NERRNO(dup2(STDIN_FILENO, fileno)); + + /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */ + if (getppid() != 1) + return fileno; + + /* We need to open /dev/null here anew, to get the right access mode. */ + return open_null_as(O_WRONLY, fileno); + } + + switch (o) { + + case EXEC_OUTPUT_NULL: + return open_null_as(O_WRONLY, fileno); + + case EXEC_OUTPUT_TTY: + if (is_terminal_input(i)) + return RET_NERRNO(dup2(STDIN_FILENO, fileno)); + + /* We don't reset the terminal if this is just about output */ + return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno); + + case EXEC_OUTPUT_KMSG: + case EXEC_OUTPUT_KMSG_AND_CONSOLE: + case EXEC_OUTPUT_JOURNAL: + case EXEC_OUTPUT_JOURNAL_AND_CONSOLE: + r = connect_logger_as(context, params, o, ident, fileno, uid, gid); + if (r < 0) { + log_exec_warning_errno(context, + params, + r, + "Failed to connect %s to the journal socket, ignoring: %m", + fileno == STDOUT_FILENO ? "stdout" : "stderr"); + r = open_null_as(O_WRONLY, fileno); + } else { + struct stat st; + + /* If we connected this fd to the journal via a stream, patch the device/inode into the passed + * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits + * services to detect whether they are connected to the journal or not. + * + * If both stdout and stderr are connected to a stream then let's make sure to store the data + * about STDERR as that's usually the best way to do logging. */ + + if (fstat(fileno, &st) >= 0 && + (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) { + *journal_stream_dev = st.st_dev; + *journal_stream_ino = st.st_ino; + } + } + return r; + + case EXEC_OUTPUT_SOCKET: + assert(socket_fd >= 0); + + return RET_NERRNO(dup2(socket_fd, fileno)); + + case EXEC_OUTPUT_NAMED_FD: + assert(named_iofds[fileno] >= 0); + + (void) fd_nonblock(named_iofds[fileno], false); + return RET_NERRNO(dup2(named_iofds[fileno], fileno)); + + case EXEC_OUTPUT_FILE: + case EXEC_OUTPUT_FILE_APPEND: + case EXEC_OUTPUT_FILE_TRUNCATE: { + bool rw; + int fd, flags; + + assert(context->stdio_file[fileno]); + + rw = context->std_input == EXEC_INPUT_FILE && + streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]); + + if (rw) + return RET_NERRNO(dup2(STDIN_FILENO, fileno)); + + flags = O_WRONLY; + if (o == EXEC_OUTPUT_FILE_APPEND) + flags |= O_APPEND; + else if (o == EXEC_OUTPUT_FILE_TRUNCATE) + flags |= O_TRUNC; + + fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask); + if (fd < 0) + return fd; + + return move_fd(fd, fileno, 0); + } + + default: + assert_not_reached(); + } +} + +static int chown_terminal(int fd, uid_t uid) { + int r; + + assert(fd >= 0); + + /* Before we chown/chmod the TTY, let's ensure this is actually a tty */ + if (isatty(fd) < 1) { + if (IN_SET(errno, EINVAL, ENOTTY)) + return 0; /* not a tty */ + + return -errno; + } + + /* This might fail. What matters are the results. */ + r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID); + if (r < 0) + return r; + + return 1; +} + +static int setup_confirm_stdio( + const ExecContext *context, + const char *vc, + int *ret_saved_stdin, + int *ret_saved_stdout) { + + _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF; + int r; + + assert(ret_saved_stdin); + assert(ret_saved_stdout); + + saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3); + if (saved_stdin < 0) + return -errno; + + saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3); + if (saved_stdout < 0) + return -errno; + + fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC); + if (fd < 0) + return fd; + + r = chown_terminal(fd, getuid()); + if (r < 0) + return r; + + r = reset_terminal_fd(fd, /* switch_to_text= */ true); + if (r < 0) + return r; + + r = exec_context_apply_tty_size(context, fd, vc); + if (r < 0) + return r; + + r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */ + TAKE_FD(fd); + if (r < 0) + return r; + + *ret_saved_stdin = TAKE_FD(saved_stdin); + *ret_saved_stdout = TAKE_FD(saved_stdout); + return 0; +} + +static void write_confirm_error_fd(int err, int fd, const char *unit_id) { + assert(err < 0); + assert(unit_id); + + if (err == -ETIMEDOUT) + dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id); + else { + errno = -err; + dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id); + } +} + +static void write_confirm_error(int err, const char *vc, const char *unit_id) { + _cleanup_close_ int fd = -EBADF; + + assert(vc); + + fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return; + + write_confirm_error_fd(err, fd, unit_id); +} + +static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) { + int r = 0; + + assert(saved_stdin); + assert(saved_stdout); + + release_terminal(); + + if (*saved_stdin >= 0) + if (dup2(*saved_stdin, STDIN_FILENO) < 0) + r = -errno; + + if (*saved_stdout >= 0) + if (dup2(*saved_stdout, STDOUT_FILENO) < 0) + r = -errno; + + *saved_stdin = safe_close(*saved_stdin); + *saved_stdout = safe_close(*saved_stdout); + + return r; +} + +enum { + CONFIRM_PRETEND_FAILURE = -1, + CONFIRM_PRETEND_SUCCESS = 0, + CONFIRM_EXECUTE = 1, +}; + +static bool confirm_spawn_disabled(void) { + return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0; +} + +static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) { + int saved_stdout = -1, saved_stdin = -1, r; + _cleanup_free_ char *e = NULL; + char c; + + assert(context); + assert(params); + + /* For any internal errors, assume a positive response. */ + r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout); + if (r < 0) { + write_confirm_error(r, params->confirm_spawn, params->unit_id); + return CONFIRM_EXECUTE; + } + + /* confirm_spawn might have been disabled while we were sleeping. */ + if (!params->confirm_spawn || confirm_spawn_disabled()) { + r = 1; + goto restore_stdio; + } + + e = ellipsize(cmdline, 60, 100); + if (!e) { + log_oom(); + r = CONFIRM_EXECUTE; + goto restore_stdio; + } + + for (;;) { + r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e); + if (r < 0) { + write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id); + r = CONFIRM_EXECUTE; + goto restore_stdio; + } + + switch (c) { + case 'c': + printf("Resuming normal execution.\n"); + manager_disable_confirm_spawn(); + r = 1; + break; + case 'D': + printf(" Unit: %s\n", + params->unit_id); + exec_context_dump(context, stdout, " "); + exec_params_dump(params, stdout, " "); + continue; /* ask again */ + case 'f': + printf("Failing execution.\n"); + r = CONFIRM_PRETEND_FAILURE; + break; + case 'h': + printf(" c - continue, proceed without asking anymore\n" + " D - dump, show the state of the unit\n" + " f - fail, don't execute the command and pretend it failed\n" + " h - help\n" + " i - info, show a short summary of the unit\n" + " j - jobs, show jobs that are in progress\n" + " s - skip, don't execute the command and pretend it succeeded\n" + " y - yes, execute the command\n"); + continue; /* ask again */ + case 'i': + printf(" Unit: %s\n" + " Command: %s\n", + params->unit_id, cmdline); + continue; /* ask again */ + case 'j': + if (sigqueue(getppid(), + SIGRTMIN+18, + (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0) + return -errno; + + continue; /* ask again */ + case 'n': + /* 'n' was removed in favor of 'f'. */ + printf("Didn't understand 'n', did you mean 'f'?\n"); + continue; /* ask again */ + case 's': + printf("Skipping execution.\n"); + r = CONFIRM_PRETEND_SUCCESS; + break; + case 'y': + r = CONFIRM_EXECUTE; + break; + default: + assert_not_reached(); + } + break; + } + +restore_stdio: + restore_confirm_stdio(&saved_stdin, &saved_stdout); + return r; +} + +static int get_fixed_user( + const char *user_or_uid, + const char **ret_username, + uid_t *ret_uid, + gid_t *ret_gid, + const char **ret_home, + const char **ret_shell) { + + int r; + + assert(user_or_uid); + assert(ret_username); + + /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway + * (i.e. are "/" or "/bin/nologin"). */ + + r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN); + if (r < 0) + return r; + + /* user_or_uid is normalized by get_user_creds to username */ + *ret_username = user_or_uid; + + return 0; +} + +static int get_fixed_group( + const char *group_or_gid, + const char **ret_groupname, + gid_t *ret_gid) { + + int r; + + assert(group_or_gid); + assert(ret_groupname); + + r = get_group_creds(&group_or_gid, ret_gid, /* flags = */ 0); + if (r < 0) + return r; + + /* group_or_gid is normalized by get_group_creds to groupname */ + *ret_groupname = group_or_gid; + + return 0; +} + +static int get_supplementary_groups(const ExecContext *c, const char *user, + const char *group, gid_t gid, + gid_t **supplementary_gids, int *ngids) { + int r, k = 0; + int ngroups_max; + bool keep_groups = false; + gid_t *groups = NULL; + _cleanup_free_ gid_t *l_gids = NULL; + + assert(c); + + /* + * If user is given, then lookup GID and supplementary groups list. + * We avoid NSS lookups for gid=0. Also we have to initialize groups + * here and as early as possible so we keep the list of supplementary + * groups of the caller. + */ + if (user && gid_is_valid(gid) && gid != 0) { + /* First step, initialize groups from /etc/groups */ + if (initgroups(user, gid) < 0) + return -errno; + + keep_groups = true; + } + + if (strv_isempty(c->supplementary_groups)) + return 0; + + /* + * If SupplementaryGroups= was passed then NGROUPS_MAX has to + * be positive, otherwise fail. + */ + errno = 0; + ngroups_max = (int) sysconf(_SC_NGROUPS_MAX); + if (ngroups_max <= 0) + return errno_or_else(EOPNOTSUPP); + + l_gids = new(gid_t, ngroups_max); + if (!l_gids) + return -ENOMEM; + + if (keep_groups) { + /* + * Lookup the list of groups that the user belongs to, we + * avoid NSS lookups here too for gid=0. + */ + k = ngroups_max; + if (getgrouplist(user, gid, l_gids, &k) < 0) + return -EINVAL; + } else + k = 0; + + STRV_FOREACH(i, c->supplementary_groups) { + const char *g; + + if (k >= ngroups_max) + return -E2BIG; + + g = *i; + r = get_group_creds(&g, l_gids+k, 0); + if (r < 0) + return r; + + k++; + } + + /* + * Sets ngids to zero to drop all supplementary groups, happens + * when we are under root and SupplementaryGroups= is empty. + */ + if (k == 0) { + *ngids = 0; + return 0; + } + + /* Otherwise get the final list of supplementary groups */ + groups = memdup(l_gids, sizeof(gid_t) * k); + if (!groups) + return -ENOMEM; + + *supplementary_gids = groups; + *ngids = k; + + groups = NULL; + + return 0; +} + +static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) { + int r; + + /* Handle SupplementaryGroups= if it is not empty */ + if (ngids > 0) { + r = maybe_setgroups(ngids, supplementary_gids); + if (r < 0) + return r; + } + + if (gid_is_valid(gid)) { + /* Then set our gids */ + if (setresgid(gid, gid, gid) < 0) + return -errno; + } + + return 0; +} + +static int set_securebits(unsigned bits, unsigned mask) { + unsigned applied; + int current; + + current = prctl(PR_GET_SECUREBITS); + if (current < 0) + return -errno; + + /* Clear all securebits defined in mask and set bits */ + applied = ((unsigned) current & ~mask) | bits; + if ((unsigned) current == applied) + return 0; + + if (prctl(PR_SET_SECUREBITS, applied) < 0) + return -errno; + + return 1; +} + +static int enforce_user( + const ExecContext *context, + uid_t uid, + uint64_t capability_ambient_set) { + assert(context); + int r; + + if (!uid_is_valid(uid)) + return 0; + + /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For + * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this + * case. */ + + if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) { + + /* First step: If we need to keep capabilities but drop privileges we need to make sure we + * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */ + r = set_securebits(1U << SECURE_KEEP_CAPS, 0); + if (r < 0) + return r; + } + + /* Second step: actually set the uids */ + if (setresuid(uid, uid, uid) < 0) + return -errno; + + /* At this point we should have all necessary capabilities but are otherwise a normal user. However, + * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done + * outside of this call. */ + return 0; +} + +#if HAVE_PAM + +static int null_conv( + int num_msg, + const struct pam_message **msg, + struct pam_response **resp, + void *appdata_ptr) { + + /* We don't support conversations */ + + return PAM_CONV_ERR; +} + +static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) { + int r, s; + + assert(handle); + + r = pam_close_session(handle, flags); + if (r != PAM_SUCCESS) + log_debug("pam_close_session() failed: %s", pam_strerror(handle, r)); + + s = pam_setcred(handle, PAM_DELETE_CRED | flags); + if (s != PAM_SUCCESS) + log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s)); + + return r != PAM_SUCCESS ? r : s; +} + +#endif + +static int setup_pam( + const char *name, + const char *user, + uid_t uid, + gid_t gid, + const char *tty, + char ***env, /* updated on success */ + const int fds[], size_t n_fds) { + +#if HAVE_PAM + + static const struct pam_conv conv = { + .conv = null_conv, + .appdata_ptr = NULL + }; + + _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; + _cleanup_strv_free_ char **e = NULL; + pam_handle_t *handle = NULL; + sigset_t old_ss; + int pam_code = PAM_SUCCESS, r; + bool close_session = false; + pid_t pam_pid = 0, parent_pid; + int flags = 0; + + assert(name); + assert(user); + assert(env); + + /* We set up PAM in the parent process, then fork. The child + * will then stay around until killed via PR_GET_PDEATHSIG or + * systemd via the cgroup logic. It will then remove the PAM + * session again. The parent process will exec() the actual + * daemon. We do things this way to ensure that the main PID + * of the daemon is the one we initially fork()ed. */ + + r = barrier_create(&barrier); + if (r < 0) + goto fail; + + if (log_get_max_level() < LOG_DEBUG) + flags |= PAM_SILENT; + + pam_code = pam_start(name, user, &conv, &handle); + if (pam_code != PAM_SUCCESS) { + handle = NULL; + goto fail; + } + + if (!tty) { + _cleanup_free_ char *q = NULL; + + /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure + * out if that's the case, and read the TTY off it. */ + + if (getttyname_malloc(STDIN_FILENO, &q) >= 0) + tty = strjoina("/dev/", q); + } + + if (tty) { + pam_code = pam_set_item(handle, PAM_TTY, tty); + if (pam_code != PAM_SUCCESS) + goto fail; + } + + STRV_FOREACH(nv, *env) { + pam_code = pam_putenv(handle, *nv); + if (pam_code != PAM_SUCCESS) + goto fail; + } + + pam_code = pam_acct_mgmt(handle, flags); + if (pam_code != PAM_SUCCESS) + goto fail; + + pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags); + if (pam_code != PAM_SUCCESS) + log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code)); + + pam_code = pam_open_session(handle, flags); + if (pam_code != PAM_SUCCESS) + goto fail; + + close_session = true; + + e = pam_getenvlist(handle); + if (!e) { + pam_code = PAM_BUF_ERR; + goto fail; + } + + /* Block SIGTERM, so that we know that it won't get lost in the child */ + + assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0); + + parent_pid = getpid_cached(); + + r = safe_fork("(sd-pam)", 0, &pam_pid); + if (r < 0) + goto fail; + if (r == 0) { + int sig, ret = EXIT_PAM; + + /* The child's job is to reset the PAM session on termination */ + barrier_set_role(&barrier, BARRIER_CHILD); + + /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only + * those fds are open here that have been opened by PAM. */ + (void) close_many(fds, n_fds); + + /* Drop privileges - we don't need any to pam_close_session and this will make + * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam + * threads to fail to exit normally */ + + r = maybe_setgroups(0, NULL); + if (r < 0) + log_warning_errno(r, "Failed to setgroups() in sd-pam: %m"); + if (setresgid(gid, gid, gid) < 0) + log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m"); + if (setresuid(uid, uid, uid) < 0) + log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m"); + + (void) ignore_signals(SIGPIPE); + + /* Wait until our parent died. This will only work if the above setresuid() succeeds, + * otherwise the kernel will not allow unprivileged parents kill their privileged children + * this way. We rely on the control groups kill logic to do the rest for us. */ + if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0) + goto child_finish; + + /* Tell the parent that our setup is done. This is especially important regarding dropping + * privileges. Otherwise, unit setup might race against our setresuid(2) call. + * + * If the parent aborted, we'll detect this below, hence ignore return failure here. */ + (void) barrier_place(&barrier); + + /* Check if our parent process might already have died? */ + if (getppid() == parent_pid) { + sigset_t ss; + + assert_se(sigemptyset(&ss) >= 0); + assert_se(sigaddset(&ss, SIGTERM) >= 0); + + for (;;) { + if (sigwait(&ss, &sig) < 0) { + if (errno == EINTR) + continue; + + goto child_finish; + } + + assert(sig == SIGTERM); + break; + } + } + + /* If our parent died we'll end the session */ + if (getppid() != parent_pid) { + pam_code = pam_close_session_and_delete_credentials(handle, flags); + if (pam_code != PAM_SUCCESS) + goto child_finish; + } + + ret = 0; + + child_finish: + /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module + * know about this. See pam_end(3) */ + (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT); + _exit(ret); + } + + barrier_set_role(&barrier, BARRIER_PARENT); + + /* If the child was forked off successfully it will do all the cleanups, so forget about the handle + * here. */ + handle = NULL; + + /* Unblock SIGTERM again in the parent */ + assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0); + + /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want + * this fd around. */ + closelog(); + + /* Synchronously wait for the child to initialize. We don't care for errors as we cannot + * recover. However, warn loudly if it happens. */ + if (!barrier_place_and_sync(&barrier)) + log_error("PAM initialization failed"); + + return strv_free_and_replace(*env, e); + +fail: + if (pam_code != PAM_SUCCESS) { + log_error("PAM failed: %s", pam_strerror(handle, pam_code)); + r = -EPERM; /* PAM errors do not map to errno */ + } else + log_error_errno(r, "PAM failed: %m"); + + if (handle) { + if (close_session) + pam_code = pam_close_session_and_delete_credentials(handle, flags); + + (void) pam_end(handle, pam_code | flags); + } + + closelog(); + return r; +#else + return 0; +#endif +} + +static void rename_process_from_path(const char *path) { + _cleanup_free_ char *buf = NULL; + const char *p; + + assert(path); + + /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in + * /bin/ps */ + + if (path_extract_filename(path, &buf) < 0) { + rename_process("(...)"); + return; + } + + size_t l = strlen(buf); + if (l > 8) { + /* The end of the process name is usually more interesting, since the first bit might just be + * "systemd-" */ + p = buf + l - 8; + l = 8; + } else + p = buf; + + char process_name[11]; + process_name[0] = '('; + memcpy(process_name+1, p, l); + process_name[1+l] = ')'; + process_name[1+l+1] = 0; + + rename_process(process_name); +} + +static bool context_has_address_families(const ExecContext *c) { + assert(c); + + return c->address_families_allow_list || + !set_isempty(c->address_families); +} + +static bool context_has_syscall_filters(const ExecContext *c) { + assert(c); + + return c->syscall_allow_list || + !hashmap_isempty(c->syscall_filter); +} + +static bool context_has_syscall_logs(const ExecContext *c) { + assert(c); + + return c->syscall_log_allow_list || + !hashmap_isempty(c->syscall_log); +} + +static bool context_has_seccomp(const ExecContext *c) { + /* We need NNP if we have any form of seccomp and are unprivileged */ + return c->lock_personality || + c->memory_deny_write_execute || + c->private_devices || + c->protect_clock || + c->protect_hostname || + c->protect_kernel_tunables || + c->protect_kernel_modules || + c->protect_kernel_logs || + context_has_address_families(c) || + exec_context_restrict_namespaces_set(c) || + c->restrict_realtime || + c->restrict_suid_sgid || + !set_isempty(c->syscall_archs) || + context_has_syscall_filters(c) || + context_has_syscall_logs(c); +} + +static bool context_has_no_new_privileges(const ExecContext *c) { + assert(c); + + if (c->no_new_privileges) + return true; + + if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */ + return false; + + return context_has_seccomp(c); +} + +#if HAVE_SECCOMP + +static bool seccomp_allows_drop_privileges(const ExecContext *c) { + void *id, *val; + bool has_capget = false, has_capset = false, has_prctl = false; + + assert(c); + + /* No syscall filter, we are allowed to drop privileges */ + if (hashmap_isempty(c->syscall_filter)) + return true; + + HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) { + _cleanup_free_ char *name = NULL; + + name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); + + if (streq(name, "capget")) + has_capget = true; + else if (streq(name, "capset")) + has_capset = true; + else if (streq(name, "prctl")) + has_prctl = true; + } + + if (c->syscall_allow_list) + return has_capget && has_capset && has_prctl; + else + return !(has_capget || has_capset || has_prctl); +} + +static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) { + + if (is_seccomp_available()) + return false; + + log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg); + return true; +} + +static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) { + uint32_t negative_action, default_action, action; + int r; + + assert(c); + assert(p); + + if (!context_has_syscall_filters(c)) + return 0; + + if (skip_seccomp_unavailable(c, p, "SystemCallFilter=")) + return 0; + + negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno); + + if (c->syscall_allow_list) { + default_action = negative_action; + action = SCMP_ACT_ALLOW; + } else { + default_action = SCMP_ACT_ALLOW; + action = negative_action; + } + + if (needs_ambient_hack) { + r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID); + if (r < 0) + return r; + } + + return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false); +} + +static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) { +#ifdef SCMP_ACT_LOG + uint32_t default_action, action; +#endif + + assert(c); + assert(p); + + if (!context_has_syscall_logs(c)) + return 0; + +#ifdef SCMP_ACT_LOG + if (skip_seccomp_unavailable(c, p, "SystemCallLog=")) + return 0; + + if (c->syscall_log_allow_list) { + /* Log nothing but the ones listed */ + default_action = SCMP_ACT_ALLOW; + action = SCMP_ACT_LOG; + } else { + /* Log everything but the ones listed */ + default_action = SCMP_ACT_LOG; + action = SCMP_ACT_ALLOW; + } + + return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false); +#else + /* old libseccomp */ + log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog="); + return 0; +#endif +} + +static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + if (set_isempty(c->syscall_archs)) + return 0; + + if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures=")) + return 0; + + return seccomp_restrict_archs(c->syscall_archs); +} + +static int apply_address_families(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + if (!context_has_address_families(c)) + return 0; + + if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies=")) + return 0; + + return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list); +} + +static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) { + int r; + + assert(c); + assert(p); + + if (!c->memory_deny_write_execute) + return 0; + + /* use prctl() if kernel supports it (6.3) */ + r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0); + if (r == 0) { + log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE"); + return 0; + } + if (r < 0 && errno != EINVAL) + return log_exec_debug_errno(c, + p, + errno, + "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m"); + /* else use seccomp */ + log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp"); + + if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute=")) + return 0; + + return seccomp_memory_deny_write_execute(); +} + +static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + if (!c->restrict_realtime) + return 0; + + if (skip_seccomp_unavailable(c, p, "RestrictRealtime=")) + return 0; + + return seccomp_restrict_realtime(); +} + +static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + if (!c->restrict_suid_sgid) + return 0; + + if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID=")) + return 0; + + return seccomp_restrict_suid_sgid(); +} + +static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but + * let's protect even those systems where this is left on in the kernel. */ + + if (!c->protect_kernel_tunables) + return 0; + + if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables=")) + return 0; + + return seccomp_protect_sysctl(); +} + +static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + /* Turn off module syscalls on ProtectKernelModules=yes */ + + if (!c->protect_kernel_modules) + return 0; + + if (skip_seccomp_unavailable(c, p, "ProtectKernelModules=")) + return 0; + + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false); +} + +static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + if (!c->protect_kernel_logs) + return 0; + + if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs=")) + return 0; + + return seccomp_protect_syslog(); +} + +static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + if (!c->protect_clock) + return 0; + + if (skip_seccomp_unavailable(c, p, "ProtectClock=")) + return 0; + + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false); +} + +static int apply_private_devices(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */ + + if (!c->private_devices) + return 0; + + if (skip_seccomp_unavailable(c, p, "PrivateDevices=")) + return 0; + + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false); +} + +static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) { + assert(c); + assert(p); + + if (!exec_context_restrict_namespaces_set(c)) + return 0; + + if (skip_seccomp_unavailable(c, p, "RestrictNamespaces=")) + return 0; + + return seccomp_restrict_namespaces(c->restrict_namespaces); +} + +static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) { + unsigned long personality; + int r; + + assert(c); + assert(p); + + if (!c->lock_personality) + return 0; + + if (skip_seccomp_unavailable(c, p, "LockPersonality=")) + return 0; + + personality = c->personality; + + /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */ + if (personality == PERSONALITY_INVALID) { + + r = opinionated_personality(&personality); + if (r < 0) + return r; + } + + return seccomp_lock_personality(personality); +} + +#endif + +#if HAVE_LIBBPF +static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) { + int r; + + assert(c); + assert(p); + + if (!exec_context_restrict_filesystems_set(c)) + return 0; + + if (p->bpf_outer_map_fd < 0) { + /* LSM BPF is unsupported or lsm_bpf_setup failed */ + log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems="); + return 0; + } + + /* We are in a new binary, so dl-open again */ + r = dlopen_bpf(); + if (r < 0) + return r; + + return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list); +} +#endif + +static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) { + assert(c); + assert(p); + + if (!c->protect_hostname) + return 0; + + if (ns_type_supported(NAMESPACE_UTS)) { + if (unshare(CLONE_NEWUTS) < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) { + *ret_exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(c, + p, + errno, + "Failed to set up UTS namespacing: %m"); + } + + log_exec_warning(c, + p, + "ProtectHostname=yes is configured, but UTS namespace setup is " + "prohibited (container manager?), ignoring namespace setup."); + } + } else + log_exec_warning(c, + p, + "ProtectHostname=yes is configured, but the kernel does not " + "support UTS namespaces, ignoring namespace setup."); + +#if HAVE_SECCOMP + int r; + + if (skip_seccomp_unavailable(c, p, "ProtectHostname=")) + return 0; + + r = seccomp_protect_hostname(); + if (r < 0) { + *ret_exit_status = EXIT_SECCOMP; + return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m"); + } +#endif + + return 0; +} + +static void do_idle_pipe_dance(int idle_pipe[static 4]) { + assert(idle_pipe); + + idle_pipe[1] = safe_close(idle_pipe[1]); + idle_pipe[2] = safe_close(idle_pipe[2]); + + if (idle_pipe[0] >= 0) { + int r; + + r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC); + + if (idle_pipe[3] >= 0 && r == 0 /* timeout */) { + ssize_t n; + + /* Signal systemd that we are bored and want to continue. */ + n = write(idle_pipe[3], "x", 1); + if (n > 0) + /* Wait for systemd to react to the signal above. */ + (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC); + } + + idle_pipe[0] = safe_close(idle_pipe[0]); + + } + + idle_pipe[3] = safe_close(idle_pipe[3]); +} + +static const char *exec_directory_env_name_to_string(ExecDirectoryType t); + +/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to + * the service payload in. */ +static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY", + [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY", + [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY", + [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY", + [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType); + +static int build_environment( + const ExecContext *c, + const ExecParameters *p, + const CGroupContext *cgroup_context, + size_t n_fds, + const char *home, + const char *username, + const char *shell, + dev_t journal_stream_dev, + ino_t journal_stream_ino, + const char *memory_pressure_path, + char ***ret) { + + _cleanup_strv_free_ char **our_env = NULL; + size_t n_env = 0; + char *x; + int r; + + assert(c); + assert(p); + assert(ret); + +#define N_ENV_VARS 19 + our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX); + if (!our_env) + return -ENOMEM; + + if (n_fds > 0) { + _cleanup_free_ char *joined = NULL; + + if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + our_env[n_env++] = x; + + if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0) + return -ENOMEM; + our_env[n_env++] = x; + + joined = strv_join(p->fd_names, ":"); + if (!joined) + return -ENOMEM; + + x = strjoin("LISTEN_FDNAMES=", joined); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) { + if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + our_env[n_env++] = x; + + if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0) + return -ENOMEM; + our_env[n_env++] = x; + } + + /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking + * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and + * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */ + if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) { + x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1"); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME + * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't + * really make much sense since we're not logged in. Hence we conditionalize the three based on + * SetLoginEnvironment= switch. */ + if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) { + r = get_fixed_user("root", &username, NULL, NULL, &home, &shell); + if (r < 0) + return log_exec_debug_errno(c, + p, + r, + "Failed to determine user credentials for root: %m"); + } + + bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user); + + if (username) { + x = strjoin("USER=", username); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + + if (set_user_login_env) { + x = strjoin("LOGNAME=", username); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + } + + if (home && set_user_login_env) { + x = strjoin("HOME=", home); + if (!x) + return -ENOMEM; + + path_simplify(x + 5); + our_env[n_env++] = x; + } + + if (shell && set_user_login_env) { + x = strjoin("SHELL=", shell); + if (!x) + return -ENOMEM; + + path_simplify(x + 6); + our_env[n_env++] = x; + } + + if (!sd_id128_is_null(p->invocation_id)) { + assert(p->invocation_id_string); + + x = strjoin("INVOCATION_ID=", p->invocation_id_string); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + + if (exec_context_needs_term(c)) { + _cleanup_free_ char *cmdline = NULL; + const char *tty_path, *term = NULL; + + tty_path = exec_context_tty_path(c); + + /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try + * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the + * container manager passes to PID 1 ends up all the way in the console login shown. */ + + if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1) + term = getenv("TERM"); + else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) { + _cleanup_free_ char *key = NULL; + + key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path)); + if (!key) + return -ENOMEM; + + r = proc_cmdline_get_key(key, 0, &cmdline); + if (r < 0) + log_exec_debug_errno(c, + p, + r, + "Failed to read %s from kernel cmdline, ignoring: %m", + key); + else if (r > 0) + term = cmdline; + } + + if (!term) + term = default_term_for_tty(tty_path); + + x = strjoin("TERM=", term); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if (journal_stream_dev != 0 && journal_stream_ino != 0) { + if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0) + return -ENOMEM; + + our_env[n_env++] = x; + } + + if (c->log_namespace) { + x = strjoin("LOG_NAMESPACE=", c->log_namespace); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + _cleanup_free_ char *joined = NULL; + const char *n; + + if (!p->prefix[t]) + continue; + + if (c->directories[t].n_items == 0) + continue; + + n = exec_directory_env_name_to_string(t); + if (!n) + continue; + + for (size_t i = 0; i < c->directories[t].n_items; i++) { + _cleanup_free_ char *prefixed = NULL; + + prefixed = path_join(p->prefix[t], c->directories[t].items[i].path); + if (!prefixed) + return -ENOMEM; + + if (!strextend_with_separator(&joined, ":", prefixed)) + return -ENOMEM; + } + + x = strjoin(n, "=", joined); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + + _cleanup_free_ char *creds_dir = NULL; + r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir); + if (r < 0) + return r; + if (r > 0) { + x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + + if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + + our_env[n_env++] = x; + + if (memory_pressure_path) { + x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + + if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) { + _cleanup_free_ char *b = NULL, *e = NULL; + + if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT, + MEMORY_PRESSURE_DEFAULT_TYPE, + cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC : + CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC), + MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0) + return -ENOMEM; + + if (base64mem(b, strlen(b) + 1, &e) < 0) + return -ENOMEM; + + x = strjoin("MEMORY_PRESSURE_WRITE=", e); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + } + + assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX); +#undef N_ENV_VARS + + *ret = TAKE_PTR(our_env); + + return 0; +} + +static int build_pass_environment(const ExecContext *c, char ***ret) { + _cleanup_strv_free_ char **pass_env = NULL; + size_t n_env = 0; + + STRV_FOREACH(i, c->pass_environment) { + _cleanup_free_ char *x = NULL; + char *v; + + v = getenv(*i); + if (!v) + continue; + x = strjoin(*i, "=", v); + if (!x) + return -ENOMEM; + + if (!GREEDY_REALLOC(pass_env, n_env + 2)) + return -ENOMEM; + + pass_env[n_env++] = TAKE_PTR(x); + pass_env[n_env] = NULL; + } + + *ret = TAKE_PTR(pass_env); + + return 0; +} + +static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) { + _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; + _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; + _cleanup_close_ int unshare_ready_fd = -EBADF; + _cleanup_(sigkill_waitp) pid_t pid = 0; + uint64_t c = 1; + ssize_t n; + int r; + + /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e. + * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to + * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which + * we however lack after opening the user namespace. To work around this we fork() a temporary child process, + * which waits for the parent to create the new user namespace while staying in the original namespace. The + * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and + * continues execution normally. + * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it + * does not need CAP_SETUID to write the single line mapping to itself. */ + + /* Can only set up multiple mappings with CAP_SETUID. */ + if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) + r = asprintf(&uid_map, + UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */ + UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */ + ouid, ouid, uid, uid); + else + r = asprintf(&uid_map, + UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */ + ouid, ouid); + + if (r < 0) + return -ENOMEM; + + /* Can only set up multiple mappings with CAP_SETGID. */ + if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) + r = asprintf(&gid_map, + GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */ + GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */ + ogid, ogid, gid, gid); + else + r = asprintf(&gid_map, + GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */ + ogid, ogid); + + if (r < 0) + return -ENOMEM; + + /* Create a communication channel so that the parent can tell the child when it finished creating the user + * namespace. */ + unshare_ready_fd = eventfd(0, EFD_CLOEXEC); + if (unshare_ready_fd < 0) + return -errno; + + /* Create a communication channel so that the child can tell the parent a proper error code in case it + * failed. */ + if (pipe2(errno_pipe, O_CLOEXEC) < 0) + return -errno; + + r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid); + if (r < 0) + return r; + if (r == 0) { + _cleanup_close_ int fd = -EBADF; + const char *a; + pid_t ppid; + + /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from + * here, after the parent opened its own user namespace. */ + + ppid = getppid(); + errno_pipe[0] = safe_close(errno_pipe[0]); + + /* Wait until the parent unshared the user namespace */ + if (read(unshare_ready_fd, &c, sizeof(c)) < 0) { + r = -errno; + goto child_fail; + } + + /* Disable the setgroups() system call in the child user namespace, for good. */ + a = procfs_file_alloca(ppid, "setgroups"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) { + r = -errno; + goto child_fail; + } + + /* If the file is missing the kernel is too old, let's continue anyway. */ + } else { + if (write(fd, "deny\n", 5) < 0) { + r = -errno; + goto child_fail; + } + + fd = safe_close(fd); + } + + /* First write the GID map */ + a = procfs_file_alloca(ppid, "gid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, gid_map, strlen(gid_map)) < 0) { + r = -errno; + goto child_fail; + } + fd = safe_close(fd); + + /* The write the UID map */ + a = procfs_file_alloca(ppid, "uid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, uid_map, strlen(uid_map)) < 0) { + r = -errno; + goto child_fail; + } + + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(errno_pipe[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + errno_pipe[1] = safe_close(errno_pipe[1]); + + if (unshare(CLONE_NEWUSER) < 0) + return -errno; + + /* Let the child know that the namespace is ready now */ + if (write(unshare_ready_fd, &c, sizeof(c)) < 0) + return -errno; + + /* Try to read an error code from the child */ + n = read(errno_pipe[0], &r, sizeof(r)); + if (n < 0) + return -errno; + if (n == sizeof(r)) { /* an error code was sent to us */ + if (r < 0) + return r; + return -EIO; + } + if (n != 0) /* on success we should have read 0 bytes */ + return -EIO; + + r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0); + if (r < 0) + return r; + if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */ + return -EIO; + + return 0; +} + +static int create_many_symlinks(const char *root, const char *source, char **symlinks) { + _cleanup_free_ char *src_abs = NULL; + int r; + + assert(source); + + src_abs = path_join(root, source); + if (!src_abs) + return -ENOMEM; + + STRV_FOREACH(dst, symlinks) { + _cleanup_free_ char *dst_abs = NULL; + + dst_abs = path_join(root, *dst); + if (!dst_abs) + return -ENOMEM; + + r = mkdir_parents_label(dst_abs, 0755); + if (r < 0) + return r; + + r = symlink_idempotent(src_abs, dst_abs, true); + if (r < 0) + return r; + } + + return 0; +} + +static int setup_exec_directory( + const ExecContext *context, + const ExecParameters *params, + uid_t uid, + gid_t gid, + ExecDirectoryType type, + bool needs_mount_namespace, + int *exit_status) { + + static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY, + [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY, + [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY, + [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY, + [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY, + }; + int r; + + assert(context); + assert(params); + assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX); + assert(exit_status); + + if (!params->prefix[type]) + return 0; + + if (params->flags & EXEC_CHOWN_DIRECTORIES) { + if (!uid_is_valid(uid)) + uid = 0; + if (!gid_is_valid(gid)) + gid = 0; + } + + for (size_t i = 0; i < context->directories[type].n_items; i++) { + _cleanup_free_ char *p = NULL, *pp = NULL; + + p = path_join(params->prefix[type], context->directories[type].items[i].path); + if (!p) { + r = -ENOMEM; + goto fail; + } + + r = mkdir_parents_label(p, 0755); + if (r < 0) + goto fail; + + if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) { + + /* If we are in user mode, and a configuration directory exists but a state directory + * doesn't exist, then we likely are upgrading from an older systemd version that + * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME + * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to + * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now + * separated. If a service has both dirs configured but only the configuration dir + * exists and the state dir does not, we assume we are looking at an update + * situation. Hence, create a compatibility symlink, so that all expectations are + * met. + * + * (We also do something similar with the log directory, which still doesn't exist in + * the xdg basedir spec. We'll make it a subdir of the state dir.) */ + + /* this assumes the state dir is always created before the configuration dir */ + assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS); + assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION); + + r = laccess(p, F_OK); + if (r == -ENOENT) { + _cleanup_free_ char *q = NULL; + + /* OK, we know that the state dir does not exist. Let's see if the dir exists + * under the configuration hierarchy. */ + + if (type == EXEC_DIRECTORY_STATE) + q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path); + else if (type == EXEC_DIRECTORY_LOGS) + q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path); + else + assert_not_reached(); + if (!q) { + r = -ENOMEM; + goto fail; + } + + r = laccess(q, F_OK); + if (r >= 0) { + /* It does exist! This hence looks like an update. Symlink the + * configuration directory into the state directory. */ + + r = symlink_idempotent(q, p, /* make_relative= */ true); + if (r < 0) + goto fail; + + log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q); + continue; + } else if (r != -ENOENT) + log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q); + + } else if (r < 0) + log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p); + } + + if (exec_directory_is_private(context, type)) { + /* So, here's one extra complication when dealing with DynamicUser=1 units. In that + * case we want to avoid leaving a directory around fully accessible that is owned by + * a dynamic user whose UID is later on reused. To lock this down we use the same + * trick used by container managers to prohibit host users to get access to files of + * the same UID in containers: we place everything inside a directory that has an + * access mode of 0700 and is owned root:root, so that it acts as security boundary + * for unprivileged host code. We then use fs namespacing to make this directory + * permeable for the service itself. + * + * Specifically: for a service which wants a special directory "foo/" we first create + * a directory "private/" with access mode 0700 owned by root:root. Then we place + * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to + * "private/foo". This way, privileged host users can access "foo/" as usual, but + * unprivileged host users can't look into it. Inside of the namespace of the unit + * "private/" is replaced by a more liberally accessible tmpfs, into which the host's + * "private/foo/" is mounted under the same name, thus disabling the access boundary + * for the service and making sure it only gets access to the dirs it needs but no + * others. Tricky? Yes, absolutely, but it works! + * + * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not + * to be owned by the service itself. + * + * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used + * for sharing files or sockets with other services. */ + + pp = path_join(params->prefix[type], "private"); + if (!pp) { + r = -ENOMEM; + goto fail; + } + + /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */ + r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + goto fail; + + if (!path_extend(&pp, context->directories[type].items[i].path)) { + r = -ENOMEM; + goto fail; + } + + /* Create all directories between the configured directory and this private root, and mark them 0755 */ + r = mkdir_parents_label(pp, 0755); + if (r < 0) + goto fail; + + if (is_dir(p, false) > 0 && + (laccess(pp, F_OK) == -ENOENT)) { + + /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move + * it over. Most likely the service has been upgraded from one that didn't use + * DynamicUser=1, to one that does. */ + + log_exec_info(context, + params, + "Found pre-existing public %s= directory %s, migrating to %s.\n" + "Apparently, service previously had DynamicUser= turned off, and has now turned it on.", + exec_directory_type_to_string(type), p, pp); + + r = RET_NERRNO(rename(p, pp)); + if (r < 0) + goto fail; + } else { + /* Otherwise, create the actual directory for the service */ + + r = mkdir_label(pp, context->directories[type].mode); + if (r < 0 && r != -EEXIST) + goto fail; + } + + if (!context->directories[type].items[i].only_create) { + /* And link it up from the original place. + * Notes + * 1) If a mount namespace is going to be used, then this symlink remains on + * the host, and a new one for the child namespace will be created later. + * 2) It is not necessary to create this symlink when one of its parent + * directories is specified and already created. E.g. + * StateDirectory=foo foo/bar + * In that case, the inode points to pp and p for "foo/bar" are the same: + * pp = "/var/lib/private/foo/bar" + * p = "/var/lib/foo/bar" + * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only + * we do not need to create the symlink, but we cannot create the symlink. + * See issue #24783. */ + r = symlink_idempotent(pp, p, true); + if (r < 0) + goto fail; + } + + } else { + _cleanup_free_ char *target = NULL; + + if (type != EXEC_DIRECTORY_CONFIGURATION && + readlink_and_make_absolute(p, &target) >= 0) { + _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL; + + /* This already exists and is a symlink? Interesting. Maybe it's one created + * by DynamicUser=1 (see above)? + * + * We do this for all directory types except for ConfigurationDirectory=, + * since they all support the private/ symlink logic at least in some + * configurations, see above. */ + + r = chase(target, NULL, 0, &target_resolved, NULL); + if (r < 0) + goto fail; + + q = path_join(params->prefix[type], "private", context->directories[type].items[i].path); + if (!q) { + r = -ENOMEM; + goto fail; + } + + /* /var/lib or friends may be symlinks. So, let's chase them also. */ + r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL); + if (r < 0) + goto fail; + + if (path_equal(q_resolved, target_resolved)) { + + /* Hmm, apparently DynamicUser= was once turned on for this service, + * but is no longer. Let's move the directory back up. */ + + log_exec_info(context, + params, + "Found pre-existing private %s= directory %s, migrating to %s.\n" + "Apparently, service previously had DynamicUser= turned on, and has now turned it off.", + exec_directory_type_to_string(type), q, p); + + r = RET_NERRNO(unlink(p)); + if (r < 0) + goto fail; + + r = RET_NERRNO(rename(q, p)); + if (r < 0) + goto fail; + } + } + + r = mkdir_label(p, context->directories[type].mode); + if (r < 0) { + if (r != -EEXIST) + goto fail; + + if (type == EXEC_DIRECTORY_CONFIGURATION) { + struct stat st; + + /* Don't change the owner/access mode of the configuration directory, + * as in the common case it is not written to by a service, and shall + * not be writable. */ + + r = RET_NERRNO(stat(p, &st)); + if (r < 0) + goto fail; + + /* Still complain if the access mode doesn't match */ + if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0) + log_exec_warning(context, + params, + "%s \'%s\' already exists but the mode is different. " + "(File system: %o %sMode: %o)", + exec_directory_type_to_string(type), context->directories[type].items[i].path, + st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777); + + continue; + } + } + } + + /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't + * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the + * current UID/GID ownership.) */ + r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID); + if (r < 0) + goto fail; + + /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not + * available to user code anyway */ + if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM) + continue; + + /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we + * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID + * assignments to exist. */ + r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW); + if (r < 0) + goto fail; + } + + /* If we are not going to run in a namespace, set up the symlinks - otherwise + * they are set up later, to allow configuring empty var/run/etc. */ + if (!needs_mount_namespace) + for (size_t i = 0; i < context->directories[type].n_items; i++) { + r = create_many_symlinks(params->prefix[type], + context->directories[type].items[i].path, + context->directories[type].items[i].symlinks); + if (r < 0) + goto fail; + } + + return 0; + +fail: + *exit_status = exit_status_table[type]; + return r; +} + +#if ENABLE_SMACK +static int setup_smack( + const ExecParameters *params, + const ExecContext *context, + int executable_fd) { + int r; + + assert(params); + assert(executable_fd >= 0); + + if (context->smack_process_label) { + r = mac_smack_apply_pid(0, context->smack_process_label); + if (r < 0) + return r; + } else if (params->fallback_smack_process_label) { + _cleanup_free_ char *exec_label = NULL; + + r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) + return r; + + r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label); + if (r < 0) + return r; + } + + return 0; +} +#endif + +static int compile_bind_mounts( + const ExecContext *context, + const ExecParameters *params, + BindMount **ret_bind_mounts, + size_t *ret_n_bind_mounts, + char ***ret_empty_directories) { + + _cleanup_strv_free_ char **empty_directories = NULL; + BindMount *bind_mounts = NULL; + size_t n, h = 0; + int r; + + assert(context); + assert(params); + assert(ret_bind_mounts); + assert(ret_n_bind_mounts); + assert(ret_empty_directories); + + CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many); + + n = context->n_bind_mounts; + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!params->prefix[t]) + continue; + + for (size_t i = 0; i < context->directories[t].n_items; i++) + n += !context->directories[t].items[i].only_create; + } + + if (n <= 0) { + *ret_bind_mounts = NULL; + *ret_n_bind_mounts = 0; + *ret_empty_directories = NULL; + return 0; + } + + bind_mounts = new(BindMount, n); + if (!bind_mounts) + return -ENOMEM; + + for (size_t i = 0; i < context->n_bind_mounts; i++) { + BindMount *item = context->bind_mounts + i; + _cleanup_free_ char *s = NULL, *d = NULL; + + s = strdup(item->source); + if (!s) + return -ENOMEM; + + d = strdup(item->destination); + if (!d) + return -ENOMEM; + + bind_mounts[h++] = (BindMount) { + .source = TAKE_PTR(s), + .destination = TAKE_PTR(d), + .read_only = item->read_only, + .recursive = item->recursive, + .ignore_enoent = item->ignore_enoent, + }; + } + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!params->prefix[t]) + continue; + + if (context->directories[t].n_items == 0) + continue; + + if (exec_directory_is_private(context, t) && + !exec_context_with_rootfs(context)) { + char *private_root; + + /* So this is for a dynamic user, and we need to make sure the process can access its own + * directory. For that we overmount the usually inaccessible "private" subdirectory with a + * tmpfs that makes it accessible and is empty except for the submounts we do this for. */ + + private_root = path_join(params->prefix[t], "private"); + if (!private_root) + return -ENOMEM; + + r = strv_consume(&empty_directories, private_root); + if (r < 0) + return r; + } + + for (size_t i = 0; i < context->directories[t].n_items; i++) { + _cleanup_free_ char *s = NULL, *d = NULL; + + /* When one of the parent directories is in the list, we cannot create the symlink + * for the child directory. See also the comments in setup_exec_directory(). */ + if (context->directories[t].items[i].only_create) + continue; + + if (exec_directory_is_private(context, t)) + s = path_join(params->prefix[t], "private", context->directories[t].items[i].path); + else + s = path_join(params->prefix[t], context->directories[t].items[i].path); + if (!s) + return -ENOMEM; + + if (exec_directory_is_private(context, t) && + exec_context_with_rootfs(context)) + /* When RootDirectory= or RootImage= are set, then the symbolic link to the private + * directory is not created on the root directory. So, let's bind-mount the directory + * on the 'non-private' place. */ + d = path_join(params->prefix[t], context->directories[t].items[i].path); + else + d = strdup(s); + if (!d) + return -ENOMEM; + + bind_mounts[h++] = (BindMount) { + .source = TAKE_PTR(s), + .destination = TAKE_PTR(d), + .read_only = false, + .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */ + .recursive = true, + .ignore_enoent = false, + }; + } + } + + assert(h == n); + + *ret_bind_mounts = TAKE_PTR(bind_mounts); + *ret_n_bind_mounts = n; + *ret_empty_directories = TAKE_PTR(empty_directories); + + return (int) n; +} + +/* ret_symlinks will contain a list of pairs src:dest that describes + * the symlinks to create later on. For example, the symlinks needed + * to safely give private directories to DynamicUser=1 users. */ +static int compile_symlinks( + const ExecContext *context, + const ExecParameters *params, + bool setup_os_release_symlink, + char ***ret_symlinks) { + + _cleanup_strv_free_ char **symlinks = NULL; + int r; + + assert(context); + assert(params); + assert(ret_symlinks); + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + for (size_t i = 0; i < context->directories[dt].n_items; i++) { + _cleanup_free_ char *private_path = NULL, *path = NULL; + + STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) { + _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL; + + src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path); + dst_abs = path_join(params->prefix[dt], *symlink); + if (!src_abs || !dst_abs) + return -ENOMEM; + + r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs)); + if (r < 0) + return r; + } + + if (!exec_directory_is_private(context, dt) || + exec_context_with_rootfs(context) || + context->directories[dt].items[i].only_create) + continue; + + private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path); + if (!private_path) + return -ENOMEM; + + path = path_join(params->prefix[dt], context->directories[dt].items[i].path); + if (!path) + return -ENOMEM; + + r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path)); + if (r < 0) + return r; + } + } + + /* We make the host's os-release available via a symlink, so that we can copy it atomically + * and readers will never get a half-written version. Note that, while the paths specified here are + * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.: + * 'os-release -> .os-release-stage/os-release' is what will be created. */ + if (setup_os_release_symlink) { + r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release"); + if (r < 0) + return r; + + r = strv_extend(&symlinks, "/run/host/os-release"); + if (r < 0) + return r; + } + + *ret_symlinks = TAKE_PTR(symlinks); + + return 0; +} + +static bool insist_on_sandboxing( + const ExecContext *context, + const char *root_dir, + const char *root_image, + const BindMount *bind_mounts, + size_t n_bind_mounts) { + + assert(context); + assert(n_bind_mounts == 0 || bind_mounts); + + /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that + * would alter the view on the file system beyond making things read-only or invisible, i.e. would + * rearrange stuff in a way we cannot ignore gracefully. */ + + if (context->n_temporary_filesystems > 0) + return true; + + if (root_dir || root_image) + return true; + + if (context->n_mount_images > 0) + return true; + + if (context->dynamic_user) + return true; + + if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories)) + return true; + + /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes + * essential. */ + for (size_t i = 0; i < n_bind_mounts; i++) + if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination)) + return true; + + if (context->log_namespace) + return true; + + return false; +} + +static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) { + _cleanup_close_ int fd = -EBADF; + int r; + + if (!runtime || !runtime->ephemeral_copy) + return 0; + + r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX); + if (r < 0) + return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m"); + + CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]); + + fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT); + if (fd >= 0) + /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */ + return 0; + + if (fd != -EAGAIN) + return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m"); + + log_debug("Making ephemeral snapshot of %s to %s", + context->root_image ?: context->root_directory, runtime->ephemeral_copy); + + if (context->root_image) + fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600, + COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME); + else + fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory, + AT_FDCWD, runtime->ephemeral_copy, + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_LOCK_BSD); + if (fd < 0) + return log_debug_errno(fd, "Failed to snapshot %s to %s: %m", + context->root_image ?: context->root_directory, runtime->ephemeral_copy); + + if (context->root_image) { + /* A root image might be subject to lots of random writes so let's try to disable COW on it + * which tends to not perform well in combination with lots of random writes. + * + * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed + * copy, but we at least want to make the intention clear. + */ + r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + if (r < 0) + log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy); + } + + r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT); + if (r < 0) + return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m"); + + return 1; +} + +static int verity_settings_prepare( + VeritySettings *verity, + const char *root_image, + const void *root_hash, + size_t root_hash_size, + const char *root_hash_path, + const void *root_hash_sig, + size_t root_hash_sig_size, + const char *root_hash_sig_path, + const char *verity_data_path) { + + int r; + + assert(verity); + + if (root_hash) { + void *d; + + d = memdup(root_hash, root_hash_size); + if (!d) + return -ENOMEM; + + free_and_replace(verity->root_hash, d); + verity->root_hash_size = root_hash_size; + verity->designator = PARTITION_ROOT; + } + + if (root_hash_sig) { + void *d; + + d = memdup(root_hash_sig, root_hash_sig_size); + if (!d) + return -ENOMEM; + + free_and_replace(verity->root_hash_sig, d); + verity->root_hash_sig_size = root_hash_sig_size; + verity->designator = PARTITION_ROOT; + } + + if (verity_data_path) { + r = free_and_strdup(&verity->data_path, verity_data_path); + if (r < 0) + return r; + } + + r = verity_settings_load( + verity, + root_image, + root_hash_path, + root_hash_sig_path); + if (r < 0) + return log_debug_errno(r, "Failed to load root hash: %m"); + + return 0; +} + +static int apply_mount_namespace( + ExecCommandFlags command_flags, + const ExecContext *context, + const ExecParameters *params, + ExecRuntime *runtime, + const char *memory_pressure_path, + char **error_path) { + + _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; + _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL, + **read_write_paths_cleanup = NULL; + _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL, + *extension_dir = NULL, *host_os_release_stage = NULL; + const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL; + char **read_write_paths; + bool needs_sandboxing, setup_os_release_symlink; + BindMount *bind_mounts = NULL; + size_t n_bind_mounts = 0; + int r; + + assert(context); + + CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many); + + if (params->flags & EXEC_APPLY_CHROOT) { + r = setup_ephemeral(context, runtime); + if (r < 0) + return r; + + if (context->root_image) + root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image; + else + root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory; + } + + r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories); + if (r < 0) + return r; + + /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the + * service will need to write to it in order to start the notifications. */ + if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) { + read_write_paths_cleanup = strv_copy(context->read_write_paths); + if (!read_write_paths_cleanup) + return -ENOMEM; + + r = strv_extend(&read_write_paths_cleanup, memory_pressure_path); + if (r < 0) + return r; + + read_write_paths = read_write_paths_cleanup; + } else + read_write_paths = context->read_write_paths; + + needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED); + if (needs_sandboxing) { + /* The runtime struct only contains the parent of the private /tmp, which is non-accessible + * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to + * use here. This does not apply when we are using /run/systemd/empty as fallback. */ + + if (context->private_tmp && runtime && runtime->shared) { + if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY)) + tmp_dir = runtime->shared->tmp_dir; + else if (runtime->shared->tmp_dir) + tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp"); + + if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY)) + var_tmp_dir = runtime->shared->var_tmp_dir; + else if (runtime->shared->var_tmp_dir) + var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp"); + } + } + + /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */ + setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image); + r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks); + if (r < 0) + return r; + + if (context->mount_propagation_flag == MS_SHARED) + log_exec_debug(context, + params, + "shared mount propagation hidden by other fs namespacing unit settings: ignoring"); + + if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) { + r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path); + if (r < 0) + return r; + } + + if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) { + propagate_dir = path_join("/run/systemd/propagate/", params->unit_id); + if (!propagate_dir) + return -ENOMEM; + + incoming_dir = strdup("/run/systemd/incoming"); + if (!incoming_dir) + return -ENOMEM; + + extension_dir = strdup("/run/systemd/unit-extensions"); + if (!extension_dir) + return -ENOMEM; + + /* If running under a different root filesystem, propagate the host's os-release. We make a + * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */ + if (setup_os_release_symlink) { + host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage"); + if (!host_os_release_stage) + return -ENOMEM; + } + } else { + assert(params->runtime_scope == RUNTIME_SCOPE_USER); + + if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) + return -ENOMEM; + + if (setup_os_release_symlink) { + if (asprintf(&host_os_release_stage, + "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage", + geteuid()) < 0) + return -ENOMEM; + } + } + + if (root_image) { + r = verity_settings_prepare( + &verity, + root_image, + context->root_hash, context->root_hash_size, context->root_hash_path, + context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path, + context->root_verity); + if (r < 0) + return r; + } + + NamespaceParameters parameters = { + .runtime_scope = params->runtime_scope, + + .root_directory = root_dir, + .root_image = root_image, + .root_image_options = context->root_image_options, + .root_image_policy = context->root_image_policy ?: &image_policy_service, + + .read_write_paths = read_write_paths, + .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL, + .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL, + + .exec_paths = needs_sandboxing ? context->exec_paths : NULL, + .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL, + + .empty_directories = empty_directories, + .symlinks = symlinks, + + .bind_mounts = bind_mounts, + .n_bind_mounts = n_bind_mounts, + + .temporary_filesystems = context->temporary_filesystems, + .n_temporary_filesystems = context->n_temporary_filesystems, + + .mount_images = context->mount_images, + .n_mount_images = context->n_mount_images, + .mount_image_policy = context->mount_image_policy ?: &image_policy_service, + + .tmp_dir = tmp_dir, + .var_tmp_dir = var_tmp_dir, + + .creds_path = creds_path, + .log_namespace = context->log_namespace, + .mount_propagation_flag = context->mount_propagation_flag, + + .verity = &verity, + + .extension_images = context->extension_images, + .n_extension_images = context->n_extension_images, + .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext, + .extension_directories = context->extension_directories, + + .propagate_dir = propagate_dir, + .incoming_dir = incoming_dir, + .extension_dir = extension_dir, + .notify_socket = root_dir || root_image ? params->notify_socket : NULL, + .host_os_release_stage = host_os_release_stage, + + /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info, + * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the + * sandbox inside the mount namespace. */ + .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir, + + .protect_control_groups = needs_sandboxing && context->protect_control_groups, + .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables, + .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules, + .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs, + .protect_hostname = needs_sandboxing && context->protect_hostname, + + .private_dev = needs_sandboxing && context->private_devices, + .private_network = needs_sandboxing && exec_needs_network_namespace(context), + .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context), + + .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context), + + /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */ + .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(), + + .protect_home = needs_sandboxing ? context->protect_home : false, + .protect_system = needs_sandboxing ? context->protect_system : false, + .protect_proc = needs_sandboxing ? context->protect_proc : false, + .proc_subset = needs_sandboxing ? context->proc_subset : false, + }; + + r = setup_namespace(¶meters, error_path); + /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports + * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively + * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a + * completely different execution environment. */ + if (r == -ENOANO) { + if (insist_on_sandboxing( + context, + root_dir, root_image, + bind_mounts, + n_bind_mounts)) + return log_exec_debug_errno(context, + params, + SYNTHETIC_ERRNO(EOPNOTSUPP), + "Failed to set up namespace, and refusing to continue since " + "the selected namespacing options alter mount environment non-trivially.\n" + "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s", + n_bind_mounts, + context->n_temporary_filesystems, + yes_no(root_dir), + yes_no(root_image), + yes_no(context->dynamic_user)); + + log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring."); + return 0; + } + + return r; +} + +static int apply_working_directory( + const ExecContext *context, + const ExecParameters *params, + ExecRuntime *runtime, + const char *home, + int *exit_status) { + + const char *d, *wd; + + assert(context); + assert(exit_status); + + if (context->working_directory_home) { + + if (!home) { + *exit_status = EXIT_CHDIR; + return -ENXIO; + } + + wd = home; + + } else + wd = empty_to_root(context->working_directory); + + if (params->flags & EXEC_APPLY_CHROOT) + d = wd; + else + d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd); + + if (chdir(d) < 0 && !context->working_directory_missing_ok) { + *exit_status = EXIT_CHDIR; + return -errno; + } + + return 0; +} + +static int apply_root_directory( + const ExecContext *context, + const ExecParameters *params, + ExecRuntime *runtime, + const bool needs_mount_ns, + int *exit_status) { + + assert(context); + assert(exit_status); + + if (params->flags & EXEC_APPLY_CHROOT) + if (!needs_mount_ns && context->root_directory) + if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) { + *exit_status = EXIT_CHROOT; + return -errno; + } + + return 0; +} + +static int setup_keyring( + const ExecContext *context, + const ExecParameters *p, + uid_t uid, gid_t gid) { + + key_serial_t keyring; + int r = 0; + uid_t saved_uid; + gid_t saved_gid; + + assert(context); + assert(p); + + /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that + * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond + * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be + * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in + * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where + * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */ + + if (context->keyring_mode == EXEC_KEYRING_INHERIT) + return 0; + + /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up + * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel + * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user + * & group is just as nasty as acquiring a reference to the user keyring. */ + + saved_uid = getuid(); + saved_gid = getgid(); + + if (gid_is_valid(gid) && gid != saved_gid) { + if (setregid(gid, -1) < 0) + return log_exec_error_errno(context, + p, + errno, + "Failed to change GID for user keyring: %m"); + } + + if (uid_is_valid(uid) && uid != saved_uid) { + if (setreuid(uid, -1) < 0) { + r = log_exec_error_errno(context, + p, + errno, + "Failed to change UID for user keyring: %m"); + goto out; + } + } + + keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0); + if (keyring == -1) { + if (errno == ENOSYS) + log_exec_debug_errno(context, + p, + errno, + "Kernel keyring not supported, ignoring."); + else if (ERRNO_IS_PRIVILEGE(errno)) + log_exec_debug_errno(context, + p, + errno, + "Kernel keyring access prohibited, ignoring."); + else if (errno == EDQUOT) + log_exec_debug_errno(context, + p, + errno, + "Out of kernel keyrings to allocate, ignoring."); + else + r = log_exec_error_errno(context, + p, + errno, + "Setting up kernel keyring failed: %m"); + + goto out; + } + + /* When requested link the user keyring into the session keyring. */ + if (context->keyring_mode == EXEC_KEYRING_SHARED) { + + if (keyctl(KEYCTL_LINK, + KEY_SPEC_USER_KEYRING, + KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) { + r = log_exec_error_errno(context, + p, + errno, + "Failed to link user keyring into session keyring: %m"); + goto out; + } + } + + /* Restore uid/gid back */ + if (uid_is_valid(uid) && uid != saved_uid) { + if (setreuid(saved_uid, -1) < 0) { + r = log_exec_error_errno(context, + p, + errno, + "Failed to change UID back for user keyring: %m"); + goto out; + } + } + + if (gid_is_valid(gid) && gid != saved_gid) { + if (setregid(saved_gid, -1) < 0) + return log_exec_error_errno(context, + p, + errno, + "Failed to change GID back for user keyring: %m"); + } + + /* Populate they keyring with the invocation ID by default, as original saved_uid. */ + if (!sd_id128_is_null(p->invocation_id)) { + key_serial_t key; + + key = add_key("user", + "invocation_id", + &p->invocation_id, + sizeof(p->invocation_id), + KEY_SPEC_SESSION_KEYRING); + if (key == -1) + log_exec_debug_errno(context, + p, + errno, + "Failed to add invocation ID to keyring, ignoring: %m"); + else { + if (keyctl(KEYCTL_SETPERM, key, + KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH| + KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0) + r = log_exec_error_errno(context, + p, + errno, + "Failed to restrict invocation ID permission: %m"); + } + } + +out: + /* Revert back uid & gid for the last time, and exit */ + /* no extra logging, as only the first already reported error matters */ + if (getuid() != saved_uid) + (void) setreuid(saved_uid, -1); + + if (getgid() != saved_gid) + (void) setregid(saved_gid, -1); + + return r; +} + +static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) { + assert(array); + assert(n); + assert(pair); + + if (pair[0] >= 0) + array[(*n)++] = pair[0]; + if (pair[1] >= 0) + array[(*n)++] = pair[1]; +} + +static int close_remaining_fds( + const ExecParameters *params, + const ExecRuntime *runtime, + int socket_fd, + const int *fds, size_t n_fds) { + + size_t n_dont_close = 0; + int dont_close[n_fds + 14]; + + assert(params); + + if (params->stdin_fd >= 0) + dont_close[n_dont_close++] = params->stdin_fd; + if (params->stdout_fd >= 0) + dont_close[n_dont_close++] = params->stdout_fd; + if (params->stderr_fd >= 0) + dont_close[n_dont_close++] = params->stderr_fd; + + if (socket_fd >= 0) + dont_close[n_dont_close++] = socket_fd; + if (n_fds > 0) { + memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds); + n_dont_close += n_fds; + } + + if (runtime) + append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket); + + if (runtime && runtime->shared) { + append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket); + append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket); + } + + if (runtime && runtime->dynamic_creds) { + if (runtime->dynamic_creds->user) + append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket); + if (runtime->dynamic_creds->group) + append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket); + } + + if (params->user_lookup_fd >= 0) + dont_close[n_dont_close++] = params->user_lookup_fd; + + return close_all_fds(dont_close, n_dont_close); +} + +static int send_user_lookup( + const char *unit_id, + int user_lookup_fd, + uid_t uid, + gid_t gid) { + + assert(unit_id); + + /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID + * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was + * specified. */ + + if (user_lookup_fd < 0) + return 0; + + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return 0; + + if (writev(user_lookup_fd, + (struct iovec[]) { + IOVEC_MAKE(&uid, sizeof(uid)), + IOVEC_MAKE(&gid, sizeof(gid)), + IOVEC_MAKE_STRING(unit_id) }, 3) < 0) + return -errno; + + return 0; +} + +static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) { + int r; + + assert(c); + assert(home); + assert(buf); + + /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */ + + if (*home) + return 0; + + if (!c->working_directory_home) + return 0; + + r = get_home_dir(buf); + if (r < 0) + return r; + + *home = *buf; + return 1; +} + +static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) { + _cleanup_strv_free_ char ** list = NULL; + int r; + + assert(c); + assert(p); + assert(ret); + + assert(c->dynamic_user); + + /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for + * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special + * directories. */ + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (t == EXEC_DIRECTORY_CONFIGURATION) + continue; + + if (!p->prefix[t]) + continue; + + for (size_t i = 0; i < c->directories[t].n_items; i++) { + char *e; + + if (exec_directory_is_private(c, t)) + e = path_join(p->prefix[t], "private", c->directories[t].items[i].path); + else + e = path_join(p->prefix[t], c->directories[t].items[i].path); + if (!e) + return -ENOMEM; + + r = strv_consume(&list, e); + if (r < 0) + return r; + } + } + + *ret = TAKE_PTR(list); + + return 0; +} + +static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) { + _cleanup_(cpu_set_reset) CPUSet s = {}; + int r; + + assert(c); + assert(ret); + + if (!c->numa_policy.nodes.set) { + log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring"); + return 0; + } + + r = numa_to_cpu_set(&c->numa_policy, &s); + if (r < 0) + return r; + + cpu_set_reset(ret); + + return cpu_set_add_all(ret, &s); +} + +static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) { + int r; + + assert(fds); + assert(n_fds); + assert(*n_fds < fds_size); + assert(fd); + + if (*fd < 0) + return 0; + + if (*fd < 3 + (int) *n_fds) { + /* Let's move the fd up, so that it's outside of the fd range we will use to store + * the fds we pass to the process (or which are closed only during execve). */ + + r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds); + if (r < 0) + return -errno; + + close_and_replace(*fd, r); + } + + fds[(*n_fds)++] = *fd; + return 1; +} + +static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) { + union sockaddr_union addr = { + .un.sun_family = AF_UNIX, + }; + socklen_t sa_len; + static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET }; + int r; + + assert(c); + assert(p); + assert(of); + assert(ofd >= 0); + + r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd)); + if (r < 0) + return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path); + + sa_len = r; + + for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) { + _cleanup_close_ int fd = -EBADF; + + fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0); + if (fd < 0) + return log_exec_error_errno(c, + p, + errno, + "Failed to create socket for %s: %m", + of->path); + + r = RET_NERRNO(connect(fd, &addr.sa, sa_len)); + if (r == -EPROTOTYPE) + continue; + if (r < 0) + return log_exec_error_errno(c, + p, + r, + "Failed to connect socket for %s: %m", + of->path); + + return TAKE_FD(fd); + } + + return log_exec_error_errno(c, + p, + SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", + of->path); +} + +static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) { + struct stat st; + _cleanup_close_ int fd = -EBADF, ofd = -EBADF; + + assert(c); + assert(p); + assert(of); + + ofd = open(of->path, O_PATH | O_CLOEXEC); + if (ofd < 0) + return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path); + + if (fstat(ofd, &st) < 0) + return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path); + + if (S_ISSOCK(st.st_mode)) { + fd = connect_unix_harder(c, p, of, ofd); + if (fd < 0) + return fd; + + if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0) + return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m", + of->path); + + log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd); + } else { + int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR; + if (FLAGS_SET(of->flags, OPENFILE_APPEND)) + flags |= O_APPEND; + else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE)) + flags |= O_TRUNC; + + fd = fd_reopen(ofd, flags | O_CLOEXEC); + if (fd < 0) + return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path); + + log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd); + } + + return TAKE_FD(fd); +} + +static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) { + int r; + + assert(c); + assert(p); + assert(n_fds); + + LIST_FOREACH(open_files, of, p->open_files) { + _cleanup_close_ int fd = -EBADF; + + fd = get_open_file_fd(c, p, of); + if (fd < 0) { + if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) { + log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path); + continue; + } + + return fd; + } + + if (!GREEDY_REALLOC(p->fds, *n_fds + 1)) + return -ENOMEM; + + r = strv_extend(&p->fd_names, of->fdname); + if (r < 0) + return r; + + p->fds[*n_fds] = TAKE_FD(fd); + + (*n_fds)++; + } + + return 0; +} + +static void log_command_line( + const ExecContext *context, + const ExecParameters *params, + const char *msg, + const char *executable, + char **argv) { + + assert(context); + assert(params); + assert(msg); + assert(executable); + + if (!DEBUG_LOGGING) + return; + + _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY); + + log_exec_struct(context, params, LOG_DEBUG, + "EXECUTABLE=%s", executable, + LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)), + LOG_EXEC_INVOCATION_ID(params)); +} + +static bool exec_context_need_unprivileged_private_users( + const ExecContext *context, + const ExecParameters *params) { + + assert(context); + assert(params); + + /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace + * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN + * (system manager) then we have privileges and don't need this. */ + if (params->runtime_scope != RUNTIME_SCOPE_USER) + return false; + + return context->private_users || + context->private_tmp || + context->private_devices || + context->private_network || + context->network_namespace_path || + context->private_ipc || + context->ipc_namespace_path || + context->private_mounts > 0 || + context->mount_apivfs || + context->n_bind_mounts > 0 || + context->n_temporary_filesystems > 0 || + context->root_directory || + !strv_isempty(context->extension_directories) || + context->protect_system != PROTECT_SYSTEM_NO || + context->protect_home != PROTECT_HOME_NO || + context->protect_kernel_tunables || + context->protect_kernel_modules || + context->protect_kernel_logs || + context->protect_control_groups || + context->protect_clock || + context->protect_hostname || + !strv_isempty(context->read_write_paths) || + !strv_isempty(context->read_only_paths) || + !strv_isempty(context->inaccessible_paths) || + !strv_isempty(context->exec_paths) || + !strv_isempty(context->no_exec_paths); +} + +static bool exec_context_shall_confirm_spawn(const ExecContext *context) { + assert(context); + + if (confirm_spawn_disabled()) + return false; + + /* For some reasons units remaining in the same process group + * as PID 1 fail to acquire the console even if it's not used + * by any process. So skip the confirmation question for them. */ + return !context->same_pgrp; +} + +static int exec_context_named_iofds( + const ExecContext *c, + const ExecParameters *p, + int named_iofds[static 3]) { + + size_t targets; + const char* stdio_fdname[3]; + size_t n_fds; + + assert(c); + assert(p); + assert(named_iofds); + + targets = (c->std_input == EXEC_INPUT_NAMED_FD) + + (c->std_output == EXEC_OUTPUT_NAMED_FD) + + (c->std_error == EXEC_OUTPUT_NAMED_FD); + + for (size_t i = 0; i < 3; i++) + stdio_fdname[i] = exec_context_fdname(c, i); + + n_fds = p->n_storage_fds + p->n_socket_fds; + + for (size_t i = 0; i < n_fds && targets > 0; i++) + if (named_iofds[STDIN_FILENO] < 0 && + c->std_input == EXEC_INPUT_NAMED_FD && + stdio_fdname[STDIN_FILENO] && + streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) { + + named_iofds[STDIN_FILENO] = p->fds[i]; + targets--; + + } else if (named_iofds[STDOUT_FILENO] < 0 && + c->std_output == EXEC_OUTPUT_NAMED_FD && + stdio_fdname[STDOUT_FILENO] && + streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) { + + named_iofds[STDOUT_FILENO] = p->fds[i]; + targets--; + + } else if (named_iofds[STDERR_FILENO] < 0 && + c->std_error == EXEC_OUTPUT_NAMED_FD && + stdio_fdname[STDERR_FILENO] && + streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) { + + named_iofds[STDERR_FILENO] = p->fds[i]; + targets--; + } + + return targets == 0 ? 0 : -ENOENT; +} + +static void exec_shared_runtime_close(ExecSharedRuntime *shared) { + if (!shared) + return; + + safe_close_pair(shared->netns_storage_socket); + safe_close_pair(shared->ipcns_storage_socket); +} + +static void exec_runtime_close(ExecRuntime *rt) { + if (!rt) + return; + + safe_close_pair(rt->ephemeral_storage_socket); + + exec_shared_runtime_close(rt->shared); + dynamic_creds_close(rt->dynamic_creds); +} + +static void exec_params_close(ExecParameters *p) { + if (!p) + return; + + p->stdin_fd = safe_close(p->stdin_fd); + p->stdout_fd = safe_close(p->stdout_fd); + p->stderr_fd = safe_close(p->stderr_fd); +} + +int exec_invoke( + const ExecCommand *command, + const ExecContext *context, + ExecParameters *params, + ExecRuntime *runtime, + const CGroupContext *cgroup_context, + int *exit_status) { + + _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL; + int r, ngids = 0; + _cleanup_free_ gid_t *supplementary_gids = NULL; + const char *username = NULL, *groupname = NULL; + _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL; + const char *home = NULL, *shell = NULL; + char **final_argv = NULL; + dev_t journal_stream_dev = 0; + ino_t journal_stream_ino = 0; + bool userns_set_up = false; + bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */ + needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */ + needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */ + needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */ + bool keep_seccomp_privileges = false; +#if HAVE_SELINUX + _cleanup_free_ char *mac_selinux_context_net = NULL; + bool use_selinux = false; +#endif +#if ENABLE_SMACK + bool use_smack = false; +#endif +#if HAVE_APPARMOR + bool use_apparmor = false; +#endif +#if HAVE_SECCOMP + uint64_t saved_bset = 0; +#endif + uid_t saved_uid = getuid(); + gid_t saved_gid = getgid(); + uid_t uid = UID_INVALID; + gid_t gid = GID_INVALID; + size_t n_fds, /* fds to pass to the child */ + n_keep_fds; /* total number of fds not to close */ + int secure_bits; + _cleanup_free_ gid_t *gids_after_pam = NULL; + int ngids_after_pam = 0; + + int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET; + size_t n_storage_fds, n_socket_fds; + + assert(command); + assert(context); + assert(params); + assert(exit_status); + + if (context->log_level_max >= 0) + log_set_max_level(context->log_level_max); + + /* Explicitly test for CVE-2021-4034 inspired invocations */ + if (!command->path || strv_isempty(command->argv)) { + *exit_status = EXIT_EXEC; + return log_exec_error_errno( + context, + params, + SYNTHETIC_ERRNO(EINVAL), + "Invalid command line arguments."); + } + + LOG_CONTEXT_PUSH_EXEC(context, params); + + if (context->std_input == EXEC_INPUT_SOCKET || + context->std_output == EXEC_OUTPUT_SOCKET || + context->std_error == EXEC_OUTPUT_SOCKET) { + + if (params->n_socket_fds > 1) + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket."); + + if (params->n_socket_fds == 0) + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket."); + + socket_fd = params->fds[0]; + n_storage_fds = n_socket_fds = 0; + } else { + n_socket_fds = params->n_socket_fds; + n_storage_fds = params->n_storage_fds; + } + n_fds = n_socket_fds + n_storage_fds; + + r = exec_context_named_iofds(context, params, named_iofds); + if (r < 0) + return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m"); + + rename_process_from_path(command->path); + + /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main + * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially, + * both of which will be demoted to SIG_DFL. */ + (void) default_signals(SIGNALS_CRASH_HANDLER, + SIGNALS_IGNORE); + + if (context->ignore_sigpipe) + (void) ignore_signals(SIGPIPE); + + r = reset_signal_mask(); + if (r < 0) { + *exit_status = EXIT_SIGNAL_MASK; + return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m"); + } + + if (params->idle_pipe) + do_idle_pipe_dance(params->idle_pipe); + + /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its + * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have + * any fds open we don't really want open during the transition. In order to make logging work, we switch the + * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */ + + log_forget_fds(); + log_set_open_when_needed(true); + log_settle_target(); + + /* In case anything used libc syslog(), close this here, too */ + closelog(); + + r = collect_open_file_fds(context, params, &n_fds); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m"); + } + + int keep_fds[n_fds + 3]; + memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int)); + n_keep_fds = n_fds; + + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->exec_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); + } + +#if HAVE_LIBBPF + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->bpf_outer_map_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); + } +#endif + + r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m"); + } + + if (!context->same_pgrp && + setsid() < 0) { + *exit_status = EXIT_SETSID; + return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m"); + } + + exec_context_tty_reset(context, params); + + if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) { + _cleanup_free_ char *cmdline = NULL; + + cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY); + if (!cmdline) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = ask_for_confirmation(context, params, cmdline); + if (r != CONFIRM_EXECUTE) { + if (r == CONFIRM_PRETEND_SUCCESS) { + *exit_status = EXIT_SUCCESS; + return 0; + } + + *exit_status = EXIT_CONFIRM; + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED), + "Execution cancelled by the user"); + } + } + + /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is + * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note + * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS + * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they + * might internally call into other NSS modules that are involved in hostname resolution, we never know. */ + if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 || + setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) { + *exit_status = EXIT_MEMORY; + return log_exec_error_errno(context, params, errno, "Failed to update environment: %m"); + } + + if (context->dynamic_user && runtime && runtime->dynamic_creds) { + _cleanup_strv_free_ char **suggested_paths = NULL; + + /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS + * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */ + if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, errno, "Failed to update environment: %m"); + } + + r = compile_suggested_paths(context, params, &suggested_paths); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid); + if (r < 0) { + *exit_status = EXIT_USER; + if (r == -EILSEQ) + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Failed to update dynamic user credentials: User or group with specified name already exists."); + return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m"); + } + + if (!uid_is_valid(uid)) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid); + } + + if (!gid_is_valid(gid)) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid); + } + + if (runtime->dynamic_creds->user) + username = runtime->dynamic_creds->user->name; + + } else { + if (context->user) { + r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell); + if (r < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m"); + } + } + + if (context->group) { + r = get_fixed_group(context->group, &groupname, &gid); + if (r < 0) { + *exit_status = EXIT_GROUP; + return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m"); + } + } + } + + /* Initialize user supplementary groups and get SupplementaryGroups= ones */ + r = get_supplementary_groups(context, username, groupname, gid, + &supplementary_gids, &ngids); + if (r < 0) { + *exit_status = EXIT_GROUP; + return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m"); + } + + r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m"); + } + + params->user_lookup_fd = safe_close(params->user_lookup_fd); + + r = acquire_home(context, uid, &home, &home_buffer); + if (r < 0) { + *exit_status = EXIT_CHDIR; + return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m"); + } + + /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */ + if (socket_fd >= 0) + (void) fd_nonblock(socket_fd, false); + + /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields. + * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */ + if (params->cgroup_path) { + _cleanup_free_ char *p = NULL; + + r = exec_params_get_cgroup_path(params, cgroup_context, &p); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m"); + } + + r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL); + if (r == -EUCLEAN) { + *exit_status = EXIT_CGROUP; + return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s " + "because the cgroup or one of its parents or " + "siblings is in the threaded mode: %m", p); + } + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p); + } + } + + if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) { + r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET); + if (r < 0) { + *exit_status = EXIT_NETWORK; + return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path); + } + } + + if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) { + r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path); + } + } + + r = setup_input(context, params, socket_fd, named_iofds); + if (r < 0) { + *exit_status = EXIT_STDIN; + return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m"); + } + + r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + if (r < 0) { + *exit_status = EXIT_STDOUT; + return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m"); + } + + r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + if (r < 0) { + *exit_status = EXIT_STDERR; + return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m"); + } + + if (context->oom_score_adjust_set) { + /* When we can't make this change due to EPERM, then let's silently skip over it. User + * namespaces prohibit write access to this file, and we shouldn't trip up over that. */ + r = set_oom_score_adjust(context->oom_score_adjust); + if (ERRNO_IS_NEG_PRIVILEGE(r)) + log_exec_debug_errno(context, params, r, + "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m"); + else if (r < 0) { + *exit_status = EXIT_OOM_ADJUST; + return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m"); + } + } + + if (context->coredump_filter_set) { + r = set_coredump_filter(context->coredump_filter); + if (ERRNO_IS_NEG_PRIVILEGE(r)) + log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m"); + else if (r < 0) { + *exit_status = EXIT_LIMITS; + return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m"); + } + } + + if (context->nice_set) { + r = setpriority_closest(context->nice); + if (r < 0) { + *exit_status = EXIT_NICE; + return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m"); + } + } + + if (context->cpu_sched_set) { + struct sched_param param = { + .sched_priority = context->cpu_sched_priority, + }; + + r = sched_setscheduler(0, + context->cpu_sched_policy | + (context->cpu_sched_reset_on_fork ? + SCHED_RESET_ON_FORK : 0), + ¶m); + if (r < 0) { + *exit_status = EXIT_SETSCHEDULER; + return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m"); + } + } + + if (context->cpu_affinity_from_numa || context->cpu_set.set) { + _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {}; + const CPUSet *cpu_set; + + if (context->cpu_affinity_from_numa) { + r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set); + if (r < 0) { + *exit_status = EXIT_CPUAFFINITY; + return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m"); + } + + cpu_set = &converted_cpu_set; + } else + cpu_set = &context->cpu_set; + + if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) { + *exit_status = EXIT_CPUAFFINITY; + return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m"); + } + } + + if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) { + r = apply_numa_policy(&context->numa_policy); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring."); + else if (r < 0) { + *exit_status = EXIT_NUMA_POLICY; + return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m"); + } + } + + if (context->ioprio_set) + if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) { + *exit_status = EXIT_IOPRIO; + return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m"); + } + + if (context->timer_slack_nsec != NSEC_INFINITY) + if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) { + *exit_status = EXIT_TIMERSLACK; + return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m"); + } + + if (context->personality != PERSONALITY_INVALID) { + r = safe_personality(context->personality); + if (r < 0) { + *exit_status = EXIT_PERSONALITY; + return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m"); + } + } + +#if ENABLE_UTMP + if (context->utmp_id) { + _cleanup_free_ char *username_alloc = NULL; + + if (!username && context->utmp_mode == EXEC_UTMP_USER) { + username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid); + if (!username_alloc) { + *exit_status = EXIT_USER; + return log_oom(); + } + } + + const char *line = context->tty_path ? + (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) : + NULL; + utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0), + line, + context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS : + context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS : + USER_PROCESS, + username ?: username_alloc); + } +#endif + + if (uid_is_valid(uid)) { + r = chown_terminal(STDIN_FILENO, uid); + if (r < 0) { + *exit_status = EXIT_STDIN; + return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m"); + } + } + + if (params->cgroup_path) { + /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1 + * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not + * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only + * touch a single hierarchy too. */ + + if (params->flags & EXEC_CGROUP_DELEGATE) { + _cleanup_free_ char *p = NULL; + + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m"); + } + + r = exec_params_get_cgroup_path(params, cgroup_context, &p); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m"); + } + if (r > 0) { + r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m"); + } + } + } + + if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) { + if (cgroup_context_want_memory_pressure(cgroup_context)) { + r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = chmod_and_chown(memory_pressure_path, 0644, uid, gid); + if (r < 0) { + log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path); + memory_pressure_path = mfree(memory_pressure_path); + } + } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) { + memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */ + if (!memory_pressure_path) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + } + } + } + + needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime); + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status); + if (r < 0) + return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]); + } + + if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) { + r = exec_setup_credentials(context, params, params->unit_id, uid, gid); + if (r < 0) { + *exit_status = EXIT_CREDENTIALS; + return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m"); + } + } + + r = build_environment( + context, + params, + cgroup_context, + n_fds, + home, + username, + shell, + journal_stream_dev, + journal_stream_ino, + memory_pressure_path, + &our_env); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = build_pass_environment(context, &pass_env); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + /* The $PATH variable is set to the default path in params->environment. However, this is overridden + * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does + * not specify PATH but the unit has ExecSearchPath. */ + if (!strv_isempty(context->exec_search_path)) { + _cleanup_free_ char *joined = NULL; + + joined = strv_join(context->exec_search_path, ":"); + if (!joined) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = strv_env_assign(&joined_exec_search_path, "PATH", joined); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + } + + accum_env = strv_env_merge(params->environment, + our_env, + joined_exec_search_path, + pass_env, + context->environment, + params->files_env); + if (!accum_env) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + accum_env = strv_env_clean(accum_env); + + (void) umask(context->umask); + + r = setup_keyring(context, params, uid, gid); + if (r < 0) { + *exit_status = EXIT_KEYRING; + return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m"); + } + + /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted + * from it. */ + needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED); + + /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked + * for it, and the kernel doesn't actually support ambient caps. */ + needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported(); + + /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly + * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not + * desired. */ + if (needs_ambient_hack) + needs_setuid = false; + else + needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID)); + + uint64_t capability_ambient_set = context->capability_ambient_set; + + if (needs_sandboxing) { + /* MAC enablement checks need to be done before a new mount ns is created, as they rely on + * /sys being present. The actual MAC context application will happen later, as late as + * possible, to avoid impacting our own code paths. */ + +#if HAVE_SELINUX + use_selinux = mac_selinux_use(); +#endif +#if ENABLE_SMACK + use_smack = mac_smack_use(); +#endif +#if HAVE_APPARMOR + use_apparmor = mac_apparmor_use(); +#endif + } + + if (needs_sandboxing) { + int which_failed; + + /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what + * is set here. (See below.) */ + + r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed); + if (r < 0) { + *exit_status = EXIT_LIMITS; + return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed)); + } + } + + if (needs_setuid && context->pam_name && username) { + /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits + * wins here. (See above.) */ + + /* All fds passed in the fds array will be closed in the pam child process. */ + r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds); + if (r < 0) { + *exit_status = EXIT_PAM; + return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m"); + } + + if (ambient_capabilities_supported()) { + uint64_t ambient_after_pam; + + /* PAM modules might have set some ambient caps. Query them here and merge them into + * the caps we want to set in the end, so that we don't end up unsetting them. */ + r = capability_get_ambient(&ambient_after_pam); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m"); + } + + capability_ambient_set |= ambient_after_pam; + } + + ngids_after_pam = getgroups_alloc(&gids_after_pam); + if (ngids_after_pam < 0) { + *exit_status = EXIT_GROUP; + return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m"); + } + } + + if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) { + /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces. + * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to + * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */ + + r = setup_private_users(saved_uid, saved_gid, uid, gid); + /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let + * the actual requested operations fail (or silently continue). */ + if (r < 0 && context->private_users) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m"); + } + if (r < 0) + log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m"); + else + userns_set_up = true; + } + + if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) { + + /* Try to enable network namespacing if network namespacing is available and we have + * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the + * new network namespace. And if we don't have that, then we could only create a network + * namespace without the ability to set up "lo". Hence gracefully skip things then. */ + if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) { + r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET); + if (ERRNO_IS_NEG_PRIVILEGE(r)) + log_exec_notice_errno(context, params, r, + "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m"); + else if (r < 0) { + *exit_status = EXIT_NETWORK; + return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m"); + } + } else if (context->network_namespace_path) { + *exit_status = EXIT_NETWORK; + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP), + "NetworkNamespacePath= is not supported, refusing."); + } else + log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without."); + } + + if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) { + + if (ns_type_supported(NAMESPACE_IPC)) { + r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC); + if (r == -EPERM) + log_exec_warning_errno(context, params, r, + "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m"); + else if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m"); + } + } else if (context->ipc_namespace_path) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP), + "IPCNamespacePath= is not supported, refusing."); + } else + log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring."); + } + + if (needs_mount_namespace) { + _cleanup_free_ char *error_path = NULL; + + r = apply_mount_namespace(command->flags, context, params, runtime, memory_pressure_path, &error_path); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m", + error_path ? ": " : "", strempty(error_path)); + } + } + + if (needs_sandboxing) { + r = apply_protect_hostname(context, params, exit_status); + if (r < 0) + return r; + } + + if (context->memory_ksm >= 0) + if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + log_exec_debug_errno(context, + params, + errno, + "KSM support not available, ignoring."); + else { + *exit_status = EXIT_KSM; + return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m"); + } + } + + /* Drop groups as early as possible. + * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root. + * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */ + if (needs_setuid) { + _cleanup_free_ gid_t *gids_to_enforce = NULL; + int ngids_to_enforce = 0; + + ngids_to_enforce = merge_gid_lists(supplementary_gids, + ngids, + gids_after_pam, + ngids_after_pam, + &gids_to_enforce); + if (ngids_to_enforce < 0) { + *exit_status = EXIT_GROUP; + return log_exec_error_errno(context, params, + ngids_to_enforce, + "Failed to merge group lists. Group membership might be incorrect: %m"); + } + + r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce); + if (r < 0) { + *exit_status = EXIT_GROUP; + return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m"); + } + } + + /* If the user namespace was not set up above, try to do it now. + * It's preferred to set up the user namespace later (after all other namespaces) so as not to be + * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the + * case of mount namespaces being less privileged when the mount point list is copied from a + * different user namespace). */ + + if (needs_sandboxing && context->private_users && !userns_set_up) { + r = setup_private_users(saved_uid, saved_gid, uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m"); + } + } + + /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we + * shall execute. */ + + _cleanup_free_ char *executable = NULL; + _cleanup_close_ int executable_fd = -EBADF; + r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd); + if (r < 0) { + if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { + log_exec_struct_errno(context, params, LOG_INFO, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_EXEC_INVOCATION_ID(params), + LOG_EXEC_MESSAGE(params, + "Executable %s missing, skipping: %m", + command->path), + "EXECUTABLE=%s", command->path); + *exit_status = EXIT_SUCCESS; + return 0; + } + + *exit_status = EXIT_EXEC; + return log_exec_struct_errno(context, params, LOG_INFO, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_EXEC_INVOCATION_ID(params), + LOG_EXEC_MESSAGE(params, + "Failed to locate executable %s: %m", + command->path), + "EXECUTABLE=%s", command->path); + } + + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); + } + +#if HAVE_SELINUX + if (needs_sandboxing && use_selinux && params->selinux_context_net) { + int fd = -EBADF; + + if (socket_fd >= 0) + fd = socket_fd; + else if (params->n_socket_fds == 1) + /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we + * use context from that fd to compute the label. */ + fd = params->fds[0]; + + if (fd >= 0) { + r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net); + if (r < 0) { + if (!context->selinux_context_ignore) { + *exit_status = EXIT_SELINUX_CONTEXT; + return log_exec_error_errno(context, + params, + r, + "Failed to determine SELinux context: %m"); + } + log_exec_debug_errno(context, + params, + r, + "Failed to determine SELinux context, ignoring: %m"); + } + } + } +#endif + + /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that + * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any + * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final + * execve(). But first, close the remaining sockets in the context objects. */ + + exec_runtime_close(runtime); + exec_params_close(params); + + r = close_all_fds(keep_fds, n_keep_fds); + if (r >= 0) + r = shift_fds(params->fds, n_fds); + if (r >= 0) + r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m"); + } + + /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off + * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined, + * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we + * came this far. */ + + secure_bits = context->secure_bits; + + if (needs_sandboxing) { + uint64_t bset; + + /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. + * (Note this is placed after the general resource limit initialization, see above, in order + * to take precedence.) */ + if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) { + if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) { + *exit_status = EXIT_LIMITS; + return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m"); + } + } + +#if ENABLE_SMACK + /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the + * process. This is the latest place before dropping capabilities. Other MAC context are set later. */ + if (use_smack && context->smack_process_label) { + r = setup_smack(params, context, executable_fd); + if (r < 0 && !context->smack_process_label_ignore) { + *exit_status = EXIT_SMACK_PROCESS_LABEL; + return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m"); + } + } +#endif + + bset = context->capability_bounding_set; + /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for + * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own, + * instead of us doing that */ + if (needs_ambient_hack) + bset |= (UINT64_C(1) << CAP_SETPCAP) | + (UINT64_C(1) << CAP_SETUID) | + (UINT64_C(1) << CAP_SETGID); + +#if HAVE_SECCOMP + /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll + * keep the needed privileges to apply it even if we're not root. */ + if (needs_setuid && + uid_is_valid(uid) && + context_has_seccomp(context) && + seccomp_allows_drop_privileges(context)) { + keep_seccomp_privileges = true; + + if (prctl(PR_SET_KEEPCAPS, 1) < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m"); + } + + /* Save the current bounding set so we can restore it after applying the seccomp + * filter */ + saved_bset = bset; + bset |= (UINT64_C(1) << CAP_SYS_ADMIN) | + (UINT64_C(1) << CAP_SETPCAP); + } +#endif + + if (!cap_test_all(bset)) { + r = capability_bounding_set_drop(bset, /* right_now= */ false); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m"); + } + } + + /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with + * keep-caps set. + * + * To be able to raise the ambient capabilities after setresuid() they have to be added to + * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid() + * the ambient capabilities can be raised as they are present in the permitted and + * inhertiable set. However it is possible that someone wants to set ambient capabilities + * without changing the user, so we also set the ambient capabilities here. + * + * The requested ambient capabilities are raised in the inheritable set if the second + * argument is true. */ + if (!needs_ambient_hack) { + r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m"); + } + } + } + + /* chroot to root directory first, before we lose the ability to chroot */ + r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status); + if (r < 0) + return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m"); + + if (needs_setuid) { + if (uid_is_valid(uid)) { + r = enforce_user(context, uid, capability_ambient_set); + if (r < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid); + } + + if (keep_seccomp_privileges) { + if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) { + r = drop_capability(CAP_SETUID); + if (r < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m"); + } + } + + r = keep_capability(CAP_SYS_ADMIN); + if (r < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m"); + } + + r = keep_capability(CAP_SETPCAP); + if (r < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m"); + } + } + + if (!needs_ambient_hack && capability_ambient_set != 0) { + + /* Raise the ambient capabilities after user change. */ + r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m"); + } + } + } + } + + /* Apply working directory here, because the working directory might be on NFS and only the user running + * this service might have the correct privilege to change to the working directory */ + r = apply_working_directory(context, params, runtime, home, exit_status); + if (r < 0) + return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m"); + + if (needs_sandboxing) { + /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to + * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires + * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls + * are restricted. */ + +#if HAVE_SELINUX + if (use_selinux) { + char *exec_context = mac_selinux_context_net ?: context->selinux_context; + + if (exec_context) { + r = setexeccon(exec_context); + if (r < 0) { + if (!context->selinux_context_ignore) { + *exit_status = EXIT_SELINUX_CONTEXT; + return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context); + } + log_exec_debug_errno(context, + params, + r, + "Failed to change SELinux context to %s, ignoring: %m", + exec_context); + } + } + } +#endif + +#if HAVE_APPARMOR + if (use_apparmor && context->apparmor_profile) { + r = aa_change_onexec(context->apparmor_profile); + if (r < 0 && !context->apparmor_profile_ignore) { + *exit_status = EXIT_APPARMOR_PROFILE; + return log_exec_error_errno(context, + params, + errno, + "Failed to prepare AppArmor profile change to %s: %m", + context->apparmor_profile); + } + } +#endif + + /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential + * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits + * requires CAP_SETPCAP. */ + if (prctl(PR_GET_SECUREBITS) != secure_bits) { + /* CAP_SETPCAP is required to set securebits. This capability is raised into the + * effective set here. + * + * The effective set is overwritten during execve() with the following values: + * + * - ambient set (for non-root processes) + * + * - (inheritable | bounding) set for root processes) + * + * Hence there is no security impact to raise it in the effective set before execve + */ + r = capability_gain_cap_setpcap(/* return_caps= */ NULL); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits"); + } + if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) { + *exit_status = EXIT_SECUREBITS; + return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m"); + } + } + + if (context_has_no_new_privileges(context)) + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { + *exit_status = EXIT_NO_NEW_PRIVILEGES; + return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m"); + } + +#if HAVE_SECCOMP + r = apply_address_families(context, params); + if (r < 0) { + *exit_status = EXIT_ADDRESS_FAMILIES; + return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m"); + } + + r = apply_memory_deny_write_execute(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m"); + } + + r = apply_restrict_realtime(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m"); + } + + r = apply_restrict_suid_sgid(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m"); + } + + r = apply_restrict_namespaces(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m"); + } + + r = apply_protect_sysctl(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m"); + } + + r = apply_protect_kernel_modules(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m"); + } + + r = apply_protect_kernel_logs(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m"); + } + + r = apply_protect_clock(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m"); + } + + r = apply_private_devices(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m"); + } + + r = apply_syscall_archs(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m"); + } + + r = apply_lock_personality(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m"); + } + + r = apply_syscall_log(context, params); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m"); + } +#endif + +#if HAVE_LIBBPF + r = apply_restrict_filesystems(context, params); + if (r < 0) { + *exit_status = EXIT_BPF; + return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m"); + } +#endif + +#if HAVE_SECCOMP + /* This really should remain as close to the execve() as possible, to make sure our own code is affected + * by the filter as little as possible. */ + r = apply_syscall_filter(context, params, needs_ambient_hack); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m"); + } + + if (keep_seccomp_privileges) { + /* Restore the capability bounding set with what's expected from the service + the + * ambient capabilities hack */ + if (!cap_test_all(saved_bset)) { + r = capability_bounding_set_drop(saved_bset, /* right_now= */ false); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m"); + } + } + + /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break + * applications that use it. */ + if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) { + r = drop_capability(CAP_SYS_ADMIN); + if (r < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m"); + } + } + + /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break + * applications that use it. */ + if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) { + r = drop_capability(CAP_SETPCAP); + if (r < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m"); + } + } + + if (prctl(PR_SET_KEEPCAPS, 0) < 0) { + *exit_status = EXIT_USER; + return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m"); + } + } +#endif + + } + + if (!strv_isempty(context->unset_environment)) { + char **ee = NULL; + + ee = strv_env_delete(accum_env, 1, context->unset_environment); + if (!ee) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + strv_free_and_replace(accum_env, ee); + } + + if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) { + _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL; + + r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_exec_error_errno(context, + params, + r, + "Failed to replace environment variables: %m"); + } + final_argv = replaced_argv; + + if (!strv_isempty(unset_variables)) { + _cleanup_free_ char *ju = strv_join(unset_variables, ", "); + log_exec_warning(context, + params, + "Referenced but unset environment variable evaluates to an empty string: %s", + strna(ju)); + } + + if (!strv_isempty(bad_variables)) { + _cleanup_free_ char *jb = strv_join(bad_variables, ", "); + log_exec_warning(context, + params, + "Invalid environment variable name evaluates to an empty string: %s", + strna(jb)); + } + } else + final_argv = command->argv; + + log_command_line(context, params, "Executing", executable, final_argv); + + if (params->exec_fd >= 0) { + uint8_t hot = 1; + + /* We have finished with all our initializations. Let's now let the manager know that. From this point + * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ + + if (write(params->exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m"); + } + } + + r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env); + + if (params->exec_fd >= 0) { + uint8_t hot = 0; + + /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager + * that POLLHUP on it no longer means execve() succeeded. */ + + if (write(params->exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m"); + } + } + + *exit_status = EXIT_EXEC; + return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable); +} diff --git a/src/core/exec-invoke.h b/src/core/exec-invoke.h new file mode 100644 index 0000000..a8a3ac6 --- /dev/null +++ b/src/core/exec-invoke.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct ExecCommand ExecCommand; +typedef struct ExecContext ExecContext; +typedef struct ExecParameters ExecParameters; +typedef struct ExecRuntime ExecRuntime; +typedef struct CGroupContext CGroupContext; + +int exec_invoke( + const ExecCommand *command, + const ExecContext *context, + ExecParameters *params, + ExecRuntime *runtime, + const CGroupContext *cgroup_context, + int *exit_status); diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c new file mode 100644 index 0000000..b1e716e --- /dev/null +++ b/src/core/execute-serialize.c @@ -0,0 +1,3896 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "af-list.h" +#include "capability-util.h" +#include "cgroup-setup.h" +#include "escape.h" +#include "exec-credential.h" +#include "execute-serialize.h" +#include "hexdecoct.h" +#include "fd-util.h" +#include "fileio.h" +#include "in-addr-prefix-util.h" +#include "parse-helpers.h" +#include "parse-util.h" +#include "percent-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "serialize.h" +#include "string-util.h" +#include "strv.h" + +static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { + _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, + *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, + *startup_cpuset_mems = NULL; + char *iface; + struct in_addr_prefix *iaai; + int r; + + assert(f); + + if (!c) + return 0; + + r = serialize_bool_elide(f, "exec-cgroup-context-cpu-accounting", c->cpu_accounting); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-io-accounting", c->io_accounting); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-block-io-accounting", c->blockio_accounting); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-memory-accounting", c->memory_accounting); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-tasks-accounting", c->tasks_accounting); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-ip-accounting", c->ip_accounting); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-memory-oom-group", c->memory_oom_group); + if (r < 0) + return r; + + if (c->cpu_weight != CGROUP_WEIGHT_INVALID) { + r = serialize_item_format(f, "exec-cgroup-context-cpu-weight", "%" PRIu64, c->cpu_weight); + if (r < 0) + return r; + } + + if (c->startup_cpu_weight != CGROUP_WEIGHT_INVALID) { + r = serialize_item_format(f, "exec-cgroup-context-startup-cpu-weight", "%" PRIu64, c->startup_cpu_weight); + if (r < 0) + return r; + } + + if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID) { + r = serialize_item_format(f, "exec-cgroup-context-cpu-shares", "%" PRIu64, c->cpu_shares); + if (r < 0) + return r; + } + + if (c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID) { + r = serialize_item_format(f, "exec-cgroup-context-startup-cpu-shares", "%" PRIu64, c->startup_cpu_shares); + if (r < 0) + return r; + } + + if (c->cpu_quota_per_sec_usec != USEC_INFINITY) { + r = serialize_usec(f, "exec-cgroup-context-cpu-quota-per-sec-usec", c->cpu_quota_per_sec_usec); + if (r < 0) + return r; + } + + if (c->cpu_quota_period_usec != USEC_INFINITY) { + r = serialize_usec(f, "exec-cgroup-context-cpu-quota-period-usec", c->cpu_quota_period_usec); + if (r < 0) + return r; + } + + cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus); + if (!cpuset_cpus) + return log_oom_debug(); + + r = serialize_item(f, "exec-cgroup-context-allowed-cpus", cpuset_cpus); + if (r < 0) + return r; + + startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus); + if (!startup_cpuset_cpus) + return log_oom_debug(); + + r = serialize_item(f, "exec-cgroup-context-startup-allowed-cpus", startup_cpuset_cpus); + if (r < 0) + return r; + + cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems); + if (!cpuset_mems) + return log_oom_debug(); + + r = serialize_item(f, "exec-cgroup-context-allowed-memory-nodes", cpuset_mems); + if (r < 0) + return r; + + startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems); + if (!startup_cpuset_mems) + return log_oom_debug(); + + r = serialize_item(f, "exec-cgroup-context-startup-allowed-memory-nodes", startup_cpuset_mems); + if (r < 0) + return r; + + if (c->io_weight != CGROUP_WEIGHT_INVALID) { + r = serialize_item_format(f, "exec-cgroup-context-io-weight", "%" PRIu64, c->io_weight); + if (r < 0) + return r; + } + + if (c->startup_io_weight != CGROUP_WEIGHT_INVALID) { + r = serialize_item_format(f, "exec-cgroup-context-startup-io-weight", "%" PRIu64, c->startup_io_weight); + if (r < 0) + return r; + } + + if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) { + r = serialize_item_format(f, "exec-cgroup-context-block-io-weight", "%" PRIu64, c->blockio_weight); + if (r < 0) + return r; + } + + if (c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) { + r = serialize_item_format(f, "exec-cgroup-context-startup-block-io-weight", "%" PRIu64, c->startup_blockio_weight); + if (r < 0) + return r; + } + + if (c->default_memory_min > 0) { + r = serialize_item_format(f, "exec-cgroup-context-default-memory-min", "%" PRIu64, c->default_memory_min); + if (r < 0) + return r; + } + + if (c->default_memory_low > 0) { + r = serialize_item_format(f, "exec-cgroup-context-default-memory-low", "%" PRIu64, c->default_memory_low); + if (r < 0) + return r; + } + + if (c->memory_min > 0) { + r = serialize_item_format(f, "exec-cgroup-context-memory-min", "%" PRIu64, c->memory_min); + if (r < 0) + return r; + } + + if (c->memory_low > 0) { + r = serialize_item_format(f, "exec-cgroup-context-memory-low", "%" PRIu64, c->memory_low); + if (r < 0) + return r; + } + + if (c->startup_memory_low > 0) { + r = serialize_item_format(f, "exec-cgroup-context-startup-memory-low", "%" PRIu64, c->startup_memory_low); + if (r < 0) + return r; + } + + if (c->memory_high != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-memory-high", "%" PRIu64, c->memory_high); + if (r < 0) + return r; + } + + if (c->startup_memory_high != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-startup-memory-high", "%" PRIu64, c->startup_memory_high); + if (r < 0) + return r; + } + + if (c->memory_max != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-memory-max", "%" PRIu64, c->memory_max); + if (r < 0) + return r; + } + + if (c->startup_memory_max != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-startup-memory-max", "%" PRIu64, c->startup_memory_max); + if (r < 0) + return r; + } + + if (c->memory_swap_max != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-memory-swap-max", "%" PRIu64, c->memory_swap_max); + if (r < 0) + return r; + } + + if (c->startup_memory_swap_max != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-startup-memory-swap-max", "%" PRIu64, c->startup_memory_swap_max); + if (r < 0) + return r; + } + + if (c->memory_zswap_max != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-memory-zswap-max", "%" PRIu64, c->memory_zswap_max); + if (r < 0) + return r; + } + + if (c->startup_memory_zswap_max != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-startup-memory-zswap-max", "%" PRIu64, c->startup_memory_zswap_max); + if (r < 0) + return r; + } + + if (c->memory_limit != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-memory-limit", "%" PRIu64, c->memory_limit); + if (r < 0) + return r; + } + + if (c->tasks_max.value != UINT64_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-tasks-max-value", "%" PRIu64, c->tasks_max.value); + if (r < 0) + return r; + } + + if (c->tasks_max.scale > 0) { + r = serialize_item_format(f, "exec-cgroup-context-tasks-max-scale", "%" PRIu64, c->tasks_max.scale); + if (r < 0) + return r; + } + + r = serialize_bool_elide(f, "exec-cgroup-context-default-memory-min-set", c->default_memory_min_set); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-default-memory-low-set", c->default_memory_low_set); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-default-startup-memory-low-set", c->default_startup_memory_low_set); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-memory-min-set", c->memory_min_set); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-memory-low-set", c->memory_low_set); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-low-set", c->startup_memory_low_set); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-high-set", c->startup_memory_high_set); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-max-set", c->startup_memory_max_set); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-swap-max-set", c->startup_memory_swap_max_set); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-zswap-max-set", c->startup_memory_zswap_max_set); + if (r < 0) + return r; + + r = serialize_item(f, "exec-cgroup-context-device-policy", cgroup_device_policy_to_string(c->device_policy)); + if (r < 0) + return r; + + r = cg_mask_to_string(c->disable_controllers, &disable_controllers_str); + if (r < 0) + return r; + + r = serialize_item(f, "exec-cgroup-context-disable-controllers", disable_controllers_str); + if (r < 0) + return r; + + r = cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str); + if (r < 0) + return r; + + r = serialize_item(f, "exec-cgroup-context-delegate-controllers", delegate_controllers_str); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-delegate", c->delegate); + if (r < 0) + return r; + + r = serialize_item(f, "exec-cgroup-context-managed-oom-swap", managed_oom_mode_to_string(c->moom_swap)); + if (r < 0) + return r; + + r = serialize_item(f, "exec-cgroup-context-managed-oom-memory-pressure", managed_oom_mode_to_string(c->moom_mem_pressure)); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-cgroup-context-managed-oom-memory-pressure-limit", "%" PRIu32, c->moom_mem_pressure_limit); + if (r < 0) + return r; + + r = serialize_item(f, "exec-cgroup-context-managed-oom-preference", managed_oom_preference_to_string(c->moom_preference)); + if (r < 0) + return r; + + r = serialize_item(f, "exec-cgroup-context-memory-pressure-watch", cgroup_pressure_watch_to_string(c->memory_pressure_watch)); + if (r < 0) + return r; + + r = serialize_item(f, "exec-cgroup-context-delegate-subgroup", c->delegate_subgroup); + if (r < 0) + return r; + + if (c->memory_pressure_threshold_usec != USEC_INFINITY) { + r = serialize_usec(f, "exec-cgroup-context-memory-pressure-threshold-usec", c->memory_pressure_threshold_usec); + if (r < 0) + return r; + } + + LIST_FOREACH(device_allow, a, c->device_allow) { + r = serialize_item_format(f, "exec-cgroup-context-device-allow", "%s %s", + a->path, + cgroup_device_permissions_to_string(a->permissions)); + if (r < 0) + return r; + } + + LIST_FOREACH(device_weights, iw, c->io_device_weights) { + r = serialize_item_format(f, "exec-cgroup-context-io-device-weight", "%s %" PRIu64, + iw->path, + iw->weight); + if (r < 0) + return r; + } + + LIST_FOREACH(device_latencies, l, c->io_device_latencies) { + r = serialize_item_format(f, "exec-cgroup-context-io-device-latency-target-usec", "%s " USEC_FMT, + l->path, + l->target_usec); + if (r < 0) + return r; + } + + LIST_FOREACH(device_limits, il, c->io_device_limits) + for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) { + _cleanup_free_ char *key = NULL; + + if (il->limits[type] == cgroup_io_limit_defaults[type]) + continue; + + key = strjoin("exec-cgroup-context-io-device-limit-", + cgroup_io_limit_type_to_string(type)); + if (!key) + return -ENOMEM; + + r = serialize_item_format(f, key, "%s %" PRIu64, il->path, il->limits[type]); + if (r < 0) + return r; + } + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) { + r = serialize_item_format(f, "exec-cgroup-context-blockio-device-weight", "%s %" PRIu64, + w->path, + w->weight); + if (r < 0) + return r; + } + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + if (b->rbps != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-blockio-read-bandwidth", "%s %" PRIu64, + b->path, + b->rbps); + if (r < 0) + return r; + } + if (b->wbps != CGROUP_LIMIT_MAX) { + r = serialize_item_format(f, "exec-cgroup-context-blockio-write-bandwidth", "%s %" PRIu64, + b->path, + b->wbps); + if (r < 0) + return r; + } + } + + SET_FOREACH(iaai, c->ip_address_allow) { + r = serialize_item(f, + "exec-cgroup-context-ip-address-allow", + IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen)); + if (r < 0) + return r; + } + SET_FOREACH(iaai, c->ip_address_deny) { + r = serialize_item(f, + "exec-cgroup-context-ip-address-deny", + IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen)); + if (r < 0) + return r; + } + + r = serialize_bool_elide(f, "exec-cgroup-context-ip-address-allow-reduced", c->ip_address_allow_reduced); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-cgroup-context-ip-address-deny-reduced", c->ip_address_deny_reduced); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-cgroup-context-ip-ingress-filter-path=", c->ip_filters_ingress); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-cgroup-context-ip-egress-filter-path=", c->ip_filters_egress); + if (r < 0) + return r; + + LIST_FOREACH(programs, p, c->bpf_foreign_programs) { + r = serialize_item_format(f, "exec-cgroup-context-bpf-program", "%" PRIu32 " %s", + p->attach_type, + p->bpffs_path); + if (r < 0) + return r; + } + + LIST_FOREACH(socket_bind_items, bi, c->socket_bind_allow) { + fprintf(f, "exec-cgroup-context-socket-bind-allow="); + cgroup_context_dump_socket_bind_item(bi, f); + fputc('\n', f); + } + + LIST_FOREACH(socket_bind_items, bi, c->socket_bind_deny) { + fprintf(f, "exec-cgroup-context-socket-bind-deny="); + cgroup_context_dump_socket_bind_item(bi, f); + fputc('\n', f); + } + + SET_FOREACH(iface, c->restrict_network_interfaces) { + r = serialize_item(f, "exec-cgroup-context-restrict-network-interfaces", iface); + if (r < 0) + return r; + } + + r = serialize_bool_elide( + f, + "exec-cgroup-context-restrict-network-interfaces-is-allow-list", + c->restrict_network_interfaces_is_allow_list); + if (r < 0) + return r; + + fputc('\n', f); /* End marker */ + + return 0; +} + +static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { + int r; + + assert(f); + + if (!c) + return 0; + + for (;;) { + _cleanup_free_ char *l = NULL; + const char *val; + + r = deserialize_read_line(f, &l); + if (r < 0) + return r; + if (r == 0) /* eof or end marker */ + break; + + if ((val = startswith(l, "exec-cgroup-context-cpu-accounting="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->cpu_accounting = r; + } else if ((val = startswith(l, "exec-cgroup-context-io-accounting="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->io_accounting = r; + } else if ((val = startswith(l, "exec-cgroup-context-block-io-accounting="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->blockio_accounting = r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-accounting="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->memory_accounting = r; + } else if ((val = startswith(l, "exec-cgroup-context-tasks-accounting="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->tasks_accounting = r; + } else if ((val = startswith(l, "exec-cgroup-context-ip-accounting="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->ip_accounting = r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-oom-group="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->memory_oom_group = r; + } else if ((val = startswith(l, "exec-cgroup-context-cpu-weight="))) { + r = safe_atou64(val, &c->cpu_weight); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-cpu-weight="))) { + r = safe_atou64(val, &c->startup_cpu_weight); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-cpu-shares="))) { + r = safe_atou64(val, &c->cpu_shares); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-cpu-shares="))) { + r = safe_atou64(val, &c->startup_cpu_shares); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-cpu-quota-per-sec-usec="))) { + r = deserialize_usec(val, &c->cpu_quota_per_sec_usec); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-cpu-quota-period-usec="))) { + r = deserialize_usec(val, &c->cpu_quota_period_usec); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-allowed-cpus="))) { + if (c->cpuset_cpus.set) + return -EINVAL; /* duplicated */ + + r = parse_cpu_set_full( + val, + &c->cpuset_cpus, + /* warn= */ false, + /* unit= */ NULL, + /* filename= */ NULL, + /* line= */ 0, + /* lvalue= */ NULL); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-allowed-cpus="))) { + if (c->startup_cpuset_cpus.set) + return -EINVAL; /* duplicated */ + + r = parse_cpu_set_full( + val, + &c->startup_cpuset_cpus, + /* warn= */ false, + /* unit= */ NULL, + /* filename= */ NULL, + /* line= */ 0, + /* lvalue= */ NULL); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-allowed-memory-nodes="))) { + if (c->cpuset_mems.set) + return -EINVAL; /* duplicated */ + + r = parse_cpu_set_full( + val, + &c->cpuset_mems, + /* warn= */ false, + /* unit= */ NULL, + /* filename= */ NULL, + /* line= */ 0, + /* lvalue= */ NULL); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-allowed-memory-nodes="))) { + if (c->startup_cpuset_mems.set) + return -EINVAL; /* duplicated */ + + r = parse_cpu_set_full( + val, + &c->startup_cpuset_mems, + /* warn= */ false, + /* unit= */ NULL, + /* filename= */ NULL, + /* line= */ 0, + /* lvalue= */ NULL); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-io-weight="))) { + r = safe_atou64(val, &c->io_weight); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-io-weight="))) { + r = safe_atou64(val, &c->startup_io_weight); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-block-io-weight="))) { + r = safe_atou64(val, &c->blockio_weight); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-block-io-weight="))) { + r = safe_atou64(val, &c->startup_blockio_weight); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-default-memory-min="))) { + r = safe_atou64(val, &c->default_memory_min); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-default-memory-low="))) { + r = safe_atou64(val, &c->default_memory_low); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-min="))) { + r = safe_atou64(val, &c->memory_min); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-low="))) { + r = safe_atou64(val, &c->memory_low); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-low="))) { + r = safe_atou64(val, &c->startup_memory_low); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-high="))) { + r = safe_atou64(val, &c->memory_high); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-high="))) { + r = safe_atou64(val, &c->startup_memory_high); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-max="))) { + r = safe_atou64(val, &c->memory_max); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-max="))) { + r = safe_atou64(val, &c->startup_memory_max); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-swap-max="))) { + r = safe_atou64(val, &c->memory_swap_max); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-swap-max="))) { + r = safe_atou64(val, &c->startup_memory_swap_max); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-zswap-max="))) { + r = safe_atou64(val, &c->memory_zswap_max); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-zswap-max="))) { + r = safe_atou64(val, &c->startup_memory_zswap_max); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-limit="))) { + r = safe_atou64(val, &c->memory_limit); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-tasks-max-value="))) { + r = safe_atou64(val, &c->tasks_max.value); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-tasks-max-scale="))) { + r = safe_atou64(val, &c->tasks_max.scale); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-default-memory-min-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->default_memory_min_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-default-memory-low-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->default_memory_low_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-default-startup-memory-low-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->default_startup_memory_low_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-min-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->memory_min_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-low-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->memory_low_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-low-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->startup_memory_low_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-high-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->startup_memory_high_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-max-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->startup_memory_max_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-swap-max-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->startup_memory_swap_max_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-zswap-max-set="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->startup_memory_zswap_max_set = r; + } else if ((val = startswith(l, "exec-cgroup-context-device-policy="))) { + c->device_policy = cgroup_device_policy_from_string(val); + if (c->device_policy < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-cgroup-context-disable-controllers="))) { + r = cg_mask_from_string(val, &c->disable_controllers); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-delegate-controllers="))) { + r = cg_mask_from_string(val, &c->delegate_controllers); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-delegate="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->delegate = r; + } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-swap="))) { + c->moom_swap = managed_oom_mode_from_string(val); + if (c->moom_swap < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-memory-pressure="))) { + c->moom_mem_pressure = managed_oom_mode_from_string(val); + if (c->moom_mem_pressure < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-memory-pressure-limit="))) { + r = safe_atou32(val, &c->moom_mem_pressure_limit); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-preference="))) { + c->moom_preference = managed_oom_preference_from_string(val); + if (c->moom_preference < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-watch="))) { + c->memory_pressure_watch = cgroup_pressure_watch_from_string(val); + if (c->memory_pressure_watch < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-cgroup-context-delegate-subgroup="))) { + r = free_and_strdup(&c->delegate_subgroup, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-threshold-usec="))) { + r = deserialize_usec(val, &c->memory_pressure_threshold_usec); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-device-allow="))) { + _cleanup_free_ char *path = NULL, *rwm = NULL; + CGroupDevicePermissions p; + + r = extract_many_words(&val, " ", 0, &path, &rwm, NULL); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + p = isempty(rwm) ? 0 : cgroup_device_permissions_from_string(rwm); + if (p < 0) + return p; + + r = cgroup_context_add_or_update_device_allow(c, path, p); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-io-device-weight="))) { + _cleanup_free_ char *path = NULL, *weight = NULL; + CGroupIODeviceWeight *a = NULL; + + r = extract_many_words(&val, " ", 0, &path, &weight, NULL); + if (r < 0) + return r; + if (r != 2) + return -EINVAL; + + LIST_FOREACH(device_weights, b, c->io_device_weights) + if (path_equal(b->path, path)) { + a = b; + break; + } + + if (!a) { + a = new0(CGroupIODeviceWeight, 1); + if (!a) + return log_oom_debug(); + + a->path = TAKE_PTR(path); + + LIST_PREPEND(device_weights, c->io_device_weights, a); + } + + r = safe_atou64(weight, &a->weight); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-io-device-latency-target-usec="))) { + _cleanup_free_ char *path = NULL, *target = NULL; + CGroupIODeviceLatency *a = NULL; + + r = extract_many_words(&val, " ", 0, &path, &target, NULL); + if (r < 0) + return r; + if (r != 2) + return -EINVAL; + + LIST_FOREACH(device_latencies, b, c->io_device_latencies) + if (path_equal(b->path, path)) { + a = b; + break; + } + + if (!a) { + a = new0(CGroupIODeviceLatency, 1); + if (!a) + return log_oom_debug(); + + a->path = TAKE_PTR(path); + + LIST_PREPEND(device_latencies, c->io_device_latencies, a); + } + + r = deserialize_usec(target, &a->target_usec); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-io-device-limit-"))) { + _cleanup_free_ char *type = NULL, *path = NULL, *limits = NULL; + CGroupIODeviceLimit *limit = NULL; + CGroupIOLimitType t; + + r = extract_many_words(&val, "= ", 0, &type, &path, &limits, NULL); + if (r < 0) + return r; + if (r != 3) + return -EINVAL; + + t = cgroup_io_limit_type_from_string(type); + if (t < 0) + return t; + + LIST_FOREACH(device_limits, i, c->io_device_limits) + if (path_equal(path, i->path)) { + limit = i; + break; + } + + if (!limit) { + limit = new0(CGroupIODeviceLimit, 1); + if (!limit) + return log_oom_debug(); + + limit->path = TAKE_PTR(path); + for (CGroupIOLimitType i = 0; i < _CGROUP_IO_LIMIT_TYPE_MAX; i++) + limit->limits[i] = cgroup_io_limit_defaults[i]; + + LIST_PREPEND(device_limits, c->io_device_limits, limit); + } + + r = safe_atou64(limits, &limit->limits[t]); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-block-io-device-weight="))) { + _cleanup_free_ char *path = NULL, *weight = NULL; + CGroupBlockIODeviceWeight *a = NULL; + + r = extract_many_words(&val, " ", 0, &path, &weight, NULL); + if (r < 0) + return r; + if (r != 2) + return -EINVAL; + + a = new0(CGroupBlockIODeviceWeight, 1); + if (!a) + return log_oom_debug(); + + a->path = TAKE_PTR(path); + + LIST_PREPEND(device_weights, c->blockio_device_weights, a); + + r = safe_atou64(weight, &a->weight); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-block-io-read-bandwidth="))) { + _cleanup_free_ char *path = NULL, *bw = NULL; + CGroupBlockIODeviceBandwidth *a = NULL; + + r = extract_many_words(&val, " ", 0, &path, &bw, NULL); + if (r < 0) + return r; + if (r != 2) + return -EINVAL; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) + if (path_equal(b->path, path)) { + a = b; + break; + } + + if (!a) { + a = new0(CGroupBlockIODeviceBandwidth, 1); + if (!a) + return log_oom_debug(); + + a->path = TAKE_PTR(path); + a->wbps = CGROUP_LIMIT_MAX; + + LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a); + } + + r = safe_atou64(bw, &a->rbps); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-block-io-write-bandwidth="))) { + _cleanup_free_ char *path = NULL, *bw = NULL; + CGroupBlockIODeviceBandwidth *a = NULL; + + r = extract_many_words(&val, " ", 0, &path, &bw, NULL); + if (r < 0) + return r; + if (r != 2) + return -EINVAL; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) + if (path_equal(b->path, path)) { + a = b; + break; + } + + if (!a) { + a = new0(CGroupBlockIODeviceBandwidth, 1); + if (!a) + return log_oom_debug(); + + a->path = TAKE_PTR(path); + a->rbps = CGROUP_LIMIT_MAX; + + LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a); + } + + r = safe_atou64(bw, &a->wbps); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-ip-address-allow="))) { + struct in_addr_prefix a; + + r = in_addr_prefix_from_string_auto(val, &a.family, &a.address, &a.prefixlen); + if (r < 0) + return r; + + r = in_addr_prefix_add(&c->ip_address_allow, &a); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-ip-address-deny="))) { + struct in_addr_prefix a; + + r = in_addr_prefix_from_string_auto(val, &a.family, &a.address, &a.prefixlen); + if (r < 0) + return r; + + r = in_addr_prefix_add(&c->ip_address_deny, &a); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-ip-address-allow-reduced="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->ip_address_allow_reduced = r; + } else if ((val = startswith(l, "exec-cgroup-context-ip-address-deny-reduced="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->ip_address_deny_reduced = r; + } else if ((val = startswith(l, "exec-cgroup-context-ip-ingress-filter-path="))) { + r = deserialize_strv(val, &c->ip_filters_ingress); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-ip-egress-filter-path="))) { + r = deserialize_strv(val, &c->ip_filters_egress); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-bpf-program="))) { + _cleanup_free_ char *type = NULL, *path = NULL; + uint32_t t; + + r = extract_many_words(&val, " ", 0, &type, &path, NULL); + if (r < 0) + return r; + if (r != 2) + return -EINVAL; + + r = safe_atou32(type, &t); + if (r < 0) + return r; + + r = cgroup_context_add_bpf_foreign_program(c, t, path); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-socket-bind-allow="))) { + CGroupSocketBindItem *item; + uint16_t nr_ports, port_min; + int af, ip_protocol; + + r = parse_socket_bind_item(val, &af, &ip_protocol, &nr_ports, &port_min); + if (r < 0) + return r; + + item = new(CGroupSocketBindItem, 1); + if (!item) + return log_oom_debug(); + *item = (CGroupSocketBindItem) { + .address_family = af, + .ip_protocol = ip_protocol, + .nr_ports = nr_ports, + .port_min = port_min, + }; + + LIST_PREPEND(socket_bind_items, c->socket_bind_allow, item); + } else if ((val = startswith(l, "exec-cgroup-context-socket-bind-deny="))) { + CGroupSocketBindItem *item; + uint16_t nr_ports, port_min; + int af, ip_protocol; + + r = parse_socket_bind_item(val, &af, &ip_protocol, &nr_ports, &port_min); + if (r < 0) + return r; + + item = new(CGroupSocketBindItem, 1); + if (!item) + return log_oom_debug(); + *item = (CGroupSocketBindItem) { + .address_family = af, + .ip_protocol = ip_protocol, + .nr_ports = nr_ports, + .port_min = port_min, + }; + + LIST_PREPEND(socket_bind_items, c->socket_bind_deny, item); + } else if ((val = startswith(l, "exec-cgroup-context-restrict-network-interfaces="))) { + r = set_ensure_allocated(&c->restrict_network_interfaces, &string_hash_ops); + if (r < 0) + return r; + + r = set_put_strdup(&c->restrict_network_interfaces, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-restrict-network-interfaces-is-allow-list="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->restrict_network_interfaces_is_allow_list = r; + } else + log_warning("Failed to parse serialized line, ignoring: %s", l); + } + + return 0; +} + +static int exec_runtime_serialize(const ExecRuntime *rt, FILE *f, FDSet *fds) { + int r; + + assert(f); + assert(fds); + + if (!rt) { + fputc('\n', f); /* End marker */ + return 0; + } + + if (rt->shared) { + r = serialize_item(f, "exec-runtime-id", rt->shared->id); + if (r < 0) + return r; + + r = serialize_item(f, "exec-runtime-tmp-dir", rt->shared->tmp_dir); + if (r < 0) + return r; + + r = serialize_item(f, "exec-runtime-var-tmp-dir", rt->shared->var_tmp_dir); + if (r < 0) + return r; + + if (rt->shared->netns_storage_socket[0] >= 0 && rt->shared->netns_storage_socket[1] >= 0) { + r = serialize_fd_many(f, fds, "exec-runtime-netns-storage-socket", rt->shared->netns_storage_socket, 2); + if (r < 0) + return r; + } + + if (rt->shared->ipcns_storage_socket[0] >= 0 && rt->shared->ipcns_storage_socket[1] >= 0) { + r = serialize_fd_many(f, fds, "exec-runtime-ipcns-storage-socket", rt->shared->ipcns_storage_socket, 2); + if (r < 0) + return r; + } + } + + if (rt->dynamic_creds) { + r = dynamic_user_serialize_one(rt->dynamic_creds->user, "exec-runtime-dynamic-creds-user", f, fds); + if (r < 0) + return r; + } + + if (rt->dynamic_creds && rt->dynamic_creds->group && rt->dynamic_creds->group == rt->dynamic_creds->user) { + r = serialize_bool(f, "exec-runtime-dynamic-creds-group-copy", true); + if (r < 0) + return r; + } else if (rt->dynamic_creds) { + r = dynamic_user_serialize_one(rt->dynamic_creds->group, "exec-runtime-dynamic-creds-group", f, fds); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-runtime-ephemeral-copy", rt->ephemeral_copy); + if (r < 0) + return r; + + if (rt->ephemeral_storage_socket[0] >= 0 && rt->ephemeral_storage_socket[1] >= 0) { + r = serialize_fd_many(f, fds, "exec-runtime-ephemeral-storage-socket", rt->ephemeral_storage_socket, 2); + if (r < 0) + return r; + } + + fputc('\n', f); /* End marker */ + + return 0; +} + +static int exec_runtime_deserialize(ExecRuntime *rt, FILE *f, FDSet *fds) { + int r; + + assert(rt); + assert(rt->shared); + assert(rt->dynamic_creds); + assert(f); + assert(fds); + + for (;;) { + _cleanup_free_ char *l = NULL; + const char *val; + + r = deserialize_read_line(f, &l); + if (r < 0) + return r; + if (r == 0) /* eof or end marker */ + break; + + if ((val = startswith(l, "exec-runtime-id="))) { + r = free_and_strdup(&rt->shared->id, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-runtime-tmp-dir="))) { + r = free_and_strdup(&rt->shared->tmp_dir, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-runtime-var-tmp-dir="))) { + r = free_and_strdup(&rt->shared->var_tmp_dir, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-runtime-netns-storage-socket="))) { + + r = deserialize_fd_many(fds, val, 2, rt->shared->netns_storage_socket); + if (r < 0) + continue; + + } else if ((val = startswith(l, "exec-runtime-ipcns-storage-socket="))) { + + r = deserialize_fd_many(fds, val, 2, rt->shared->ipcns_storage_socket); + if (r < 0) + continue; + + } else if ((val = startswith(l, "exec-runtime-dynamic-creds-user="))) + dynamic_user_deserialize_one(/* m= */ NULL, val, fds, &rt->dynamic_creds->user); + else if ((val = startswith(l, "exec-runtime-dynamic-creds-group="))) + dynamic_user_deserialize_one(/* m= */ NULL, val, fds, &rt->dynamic_creds->group); + else if ((val = startswith(l, "exec-runtime-dynamic-creds-group-copy="))) { + r = parse_boolean(val); + if (r < 0) + return r; + if (!r) + continue; /* Nothing to do */ + + if (!rt->dynamic_creds->user) + return -EINVAL; + + rt->dynamic_creds->group = dynamic_user_ref(rt->dynamic_creds->user); + } else if ((val = startswith(l, "exec-runtime-ephemeral-copy="))) { + r = free_and_strdup(&rt->ephemeral_copy, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-runtime-ephemeral-storage-socket="))) { + + r = deserialize_fd_many(fds, val, 2, rt->ephemeral_storage_socket); + if (r < 0) + continue; + } else + log_warning("Failed to parse serialized line, ignoring: %s", l); + } + + return 0; +} + +static bool exec_parameters_is_idle_pipe_set(const ExecParameters *p) { + assert(p); + + return p->idle_pipe && + p->idle_pipe[0] >= 0 && + p->idle_pipe[1] >= 0 && + p->idle_pipe[2] >= 0 && + p->idle_pipe[3] >= 0; +} + +static int exec_parameters_serialize(const ExecParameters *p, const ExecContext *c, FILE *f, FDSet *fds) { + int r; + + assert(f); + assert(fds); + + if (!p) + return 0; + + r = serialize_item(f, "exec-parameters-runtime-scope", runtime_scope_to_string(p->runtime_scope)); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-parameters-environment", p->environment); + if (r < 0) + return r; + + if (p->fds) { + if (p->n_socket_fds > 0) { + r = serialize_item_format(f, "exec-parameters-n-socket-fds", "%zu", p->n_socket_fds); + if (r < 0) + return r; + } + + if (p->n_storage_fds > 0) { + r = serialize_item_format(f, "exec-parameters-n-storage-fds", "%zu", p->n_storage_fds); + if (r < 0) + return r; + } + + r = serialize_fd_many(f, fds, "exec-parameters-fds", p->fds, p->n_socket_fds + p->n_storage_fds); + if (r < 0) + return r; + } + + r = serialize_strv(f, "exec-parameters-fd-names", p->fd_names); + if (r < 0) + return r; + + if (p->flags != 0) { + r = serialize_item_format(f, "exec-parameters-flags", "%u", (unsigned) p->flags); + if (r < 0) + return r; + } + + r = serialize_bool_elide(f, "exec-parameters-selinux-context-net", p->selinux_context_net); + if (r < 0) + return r; + + if (p->cgroup_supported != 0) { + r = serialize_item_format(f, "exec-parameters-cgroup-supported", "%u", (unsigned) p->cgroup_supported); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-parameters-cgroup-path", p->cgroup_path); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-parameters-cgroup-id", "%" PRIu64, p->cgroup_id); + if (r < 0) + return r; + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + _cleanup_free_ char *key = NULL; + + key = strjoin("exec-parameters-prefix-directories-", exec_directory_type_to_string(dt)); + if (!key) + return log_oom_debug(); + + /* Always serialize, even an empty prefix, as this is a fixed array and we always expect + * to have all elements (unless fuzzing is happening, hence the NULL check). */ + r = serialize_item(f, key, strempty(p->prefix ? p->prefix[dt] : NULL)); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-parameters-received-credentials-directory", p->received_credentials_directory); + if (r < 0) + return r; + + r = serialize_item(f, "exec-parameters-received-encrypted-credentials-directory", p->received_encrypted_credentials_directory); + if (r < 0) + return r; + + r = serialize_item(f, "exec-parameters-confirm-spawn", p->confirm_spawn); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-parameters-shall-confirm-spawn", p->shall_confirm_spawn); + if (r < 0) + return r; + + if (p->watchdog_usec > 0) { + r = serialize_usec(f, "exec-parameters-watchdog-usec", p->watchdog_usec); + if (r < 0) + return r; + } + + if (exec_parameters_is_idle_pipe_set(p)) { + r = serialize_fd_many(f, fds, "exec-parameters-idle-pipe", p->idle_pipe, 4); + if (r < 0) + return r; + } + + r = serialize_fd(f, fds, "exec-parameters-stdin-fd", p->stdin_fd); + if (r < 0) + return r; + + r = serialize_fd(f, fds, "exec-parameters-stdout-fd", p->stdout_fd); + if (r < 0) + return r; + + r = serialize_fd(f, fds, "exec-parameters-stderr-fd", p->stderr_fd); + if (r < 0) + return r; + + r = serialize_fd(f, fds, "exec-parameters-exec-fd", p->exec_fd); + if (r < 0) + return r; + + if (c && exec_context_restrict_filesystems_set(c)) { + r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_outer_map_fd); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-parameters-notify-socket", p->notify_socket); + if (r < 0) + return r; + + LIST_FOREACH(open_files, file, p->open_files) { + _cleanup_free_ char *ofs = NULL; + + r = open_file_to_string(file, &ofs); + if (r < 0) + return r; + + r = serialize_item(f, "exec-parameters-open-file", ofs); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-parameters-fallback-smack-process-label", p->fallback_smack_process_label); + if (r < 0) + return r; + + r = serialize_fd(f, fds, "exec-parameters-user-lookup-fd", p->user_lookup_fd); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-parameters-files-env", p->files_env); + if (r < 0) + return r; + + r = serialize_item(f, "exec-parameters-unit-id", p->unit_id); + if (r < 0) + return r; + + r = serialize_item(f, "exec-parameters-invocation-id-string", p->invocation_id_string); + if (r < 0) + return r; + + fputc('\n', f); /* End marker */ + + return 0; +} + +static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { + int r, nr_open; + + assert(p); + assert(f); + assert(fds); + + nr_open = read_nr_open(); + if (nr_open < 3) + nr_open = HIGH_RLIMIT_NOFILE; + assert(nr_open > 0); /* For compilers/static analyzers */ + + for (;;) { + _cleanup_free_ char *l = NULL; + const char *val; + + r = deserialize_read_line(f, &l); + if (r < 0) + return r; + if (r == 0) /* eof or end marker */ + break; + + if ((val = startswith(l, "exec-parameters-runtime-scope="))) { + p->runtime_scope = runtime_scope_from_string(val); + if (p->runtime_scope < 0) + return p->runtime_scope; + } else if ((val = startswith(l, "exec-parameters-environment="))) { + r = deserialize_strv(val, &p->environment); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-n-socket-fds="))) { + if (p->fds) + return -EINVAL; /* Already received */ + + r = safe_atozu(val, &p->n_socket_fds); + if (r < 0) + return r; + + if (p->n_socket_fds > (size_t) nr_open) + return -EINVAL; /* too many, someone is playing games with us */ + } else if ((val = startswith(l, "exec-parameters-n-storage-fds="))) { + if (p->fds) + return -EINVAL; /* Already received */ + + r = safe_atozu(val, &p->n_storage_fds); + if (r < 0) + return r; + + if (p->n_storage_fds > (size_t) nr_open) + return -EINVAL; /* too many, someone is playing games with us */ + } else if ((val = startswith(l, "exec-parameters-fds="))) { + if (p->n_socket_fds + p->n_storage_fds == 0) + return log_warning_errno( + SYNTHETIC_ERRNO(EINVAL), + "Got exec-parameters-fds= without " + "prior exec-parameters-n-socket-fds= or exec-parameters-n-storage-fds="); + if (p->n_socket_fds + p->n_storage_fds > (size_t) nr_open) + return -EINVAL; /* too many, someone is playing games with us */ + + if (p->fds) + return -EINVAL; /* duplicated */ + + p->fds = new(int, p->n_socket_fds + p->n_storage_fds); + if (!p->fds) + return log_oom_debug(); + + /* Ensure we don't leave any FD uninitialized on error, it makes the fuzzer sad */ + for (size_t i = 0; i < p->n_socket_fds + p->n_storage_fds; ++i) + p->fds[i] = -EBADF; + + r = deserialize_fd_many(fds, val, p->n_socket_fds + p->n_storage_fds, p->fds); + if (r < 0) + continue; + + } else if ((val = startswith(l, "exec-parameters-fd-names="))) { + r = deserialize_strv(val, &p->fd_names); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-flags="))) { + unsigned flags; + + r = safe_atou(val, &flags); + if (r < 0) + return r; + p->flags = flags; + } else if ((val = startswith(l, "exec-parameters-selinux-context-net="))) { + r = parse_boolean(val); + if (r < 0) + return r; + + p->selinux_context_net = r; + } else if ((val = startswith(l, "exec-parameters-cgroup-supported="))) { + unsigned cgroup_supported; + + r = safe_atou(val, &cgroup_supported); + if (r < 0) + return r; + p->cgroup_supported = cgroup_supported; + } else if ((val = startswith(l, "exec-parameters-cgroup-path="))) { + r = free_and_strdup(&p->cgroup_path, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-cgroup-id="))) { + r = safe_atou64(val, &p->cgroup_id); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-prefix-directories-"))) { + _cleanup_free_ char *type = NULL, *prefix = NULL; + ExecDirectoryType dt; + + r = extract_many_words(&val, "= ", 0, &type, &prefix, NULL); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + dt = exec_directory_type_from_string(type); + if (dt < 0) + return -EINVAL; + + if (!p->prefix) { + p->prefix = new0(char*, _EXEC_DIRECTORY_TYPE_MAX+1); + if (!p->prefix) + return log_oom_debug(); + } + + if (isempty(prefix)) + p->prefix[dt] = mfree(p->prefix[dt]); + else + free_and_replace(p->prefix[dt], prefix); + } else if ((val = startswith(l, "exec-parameters-received-credentials-directory="))) { + r = free_and_strdup(&p->received_credentials_directory, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-received-encrypted-credentials-directory="))) { + r = free_and_strdup(&p->received_encrypted_credentials_directory, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-confirm-spawn="))) { + r = free_and_strdup(&p->confirm_spawn, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-shall-confirm-spawn="))) { + r = parse_boolean(val); + if (r < 0) + return r; + + p->shall_confirm_spawn = r; + } else if ((val = startswith(l, "exec-parameters-watchdog-usec="))) { + r = deserialize_usec(val, &p->watchdog_usec); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-idle-pipe="))) { + if (p->idle_pipe) + return -EINVAL; /* duplicated */ + + p->idle_pipe = new(int, 4); + if (!p->idle_pipe) + return log_oom_debug(); + + p->idle_pipe[0] = p->idle_pipe[1] = p->idle_pipe[2] = p->idle_pipe[3] = -EBADF; + + r = deserialize_fd_many(fds, val, 4, p->idle_pipe); + if (r < 0) + continue; + + } else if ((val = startswith(l, "exec-parameters-stdin-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + p->stdin_fd = fd; + + } else if ((val = startswith(l, "exec-parameters-stdout-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + p->stdout_fd = fd; + + } else if ((val = startswith(l, "exec-parameters-stderr-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + p->stderr_fd = fd; + } else if ((val = startswith(l, "exec-parameters-exec-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + p->exec_fd = fd; + } else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + p->bpf_outer_map_fd = fd; + } else if ((val = startswith(l, "exec-parameters-notify-socket="))) { + r = free_and_strdup(&p->notify_socket, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-open-file="))) { + OpenFile *of = NULL; + + r = open_file_parse(val, &of); + if (r < 0) + return r; + + LIST_APPEND(open_files, p->open_files, of); + } else if ((val = startswith(l, "exec-parameters-fallback-smack-process-label="))) { + r = free_and_strdup(&p->fallback_smack_process_label, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-user-lookup-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + p->user_lookup_fd = fd; + } else if ((val = startswith(l, "exec-parameters-files-env="))) { + r = deserialize_strv(val, &p->files_env); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-unit-id="))) { + r = free_and_strdup(&p->unit_id, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-parameters-invocation-id-string="))) { + if (strlen(val) > SD_ID128_STRING_MAX - 1) + return -EINVAL; + + r = sd_id128_from_string(val, &p->invocation_id); + if (r < 0) + return r; + + sd_id128_to_string(p->invocation_id, p->invocation_id_string); + } else + log_warning("Failed to parse serialized line, ignoring: %s", l); + } + + /* Bail out if we got exec-parameters-n-{socket/storage}-fds= but no corresponding + * exec-parameters-fds= */ + if (p->n_socket_fds + p->n_storage_fds > 0 && !p->fds) + return -EINVAL; + + return 0; +} + +static int serialize_std_out_err(const ExecContext *c, FILE *f, int fileno) { + char *key, *value; + const char *type; + + assert(c); + assert(f); + assert(IN_SET(fileno, STDOUT_FILENO, STDERR_FILENO)); + + type = fileno == STDOUT_FILENO ? "output" : "error"; + + switch (fileno == STDOUT_FILENO ? c->std_output : c->std_error) { + case EXEC_OUTPUT_NAMED_FD: + key = strjoina("exec-context-std-", type, "-fd-name"); + value = c->stdio_fdname[fileno]; + + break; + + case EXEC_OUTPUT_FILE: + key = strjoina("exec-context-std-", type, "-file"); + value = c->stdio_file[fileno]; + + break; + + case EXEC_OUTPUT_FILE_APPEND: + key = strjoina("exec-context-std-", type, "-file-append"); + value = c->stdio_file[fileno]; + + break; + + case EXEC_OUTPUT_FILE_TRUNCATE: + key = strjoina("exec-context-std-", type, "-file-truncate"); + value = c->stdio_file[fileno]; + + break; + + default: + return 0; + } + + return serialize_item(f, key, value); +} + +static int exec_context_serialize(const ExecContext *c, FILE *f) { + int r; + + assert(f); + + if (!c) + return 0; + + r = serialize_strv(f, "exec-context-environment", c->environment); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-environment-files", c->environment_files); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-pass-environment", c->pass_environment); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-unset-environment", c->unset_environment); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-working-directory", c->working_directory); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-root-directory", c->root_directory); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-root-image", c->root_image); + if (r < 0) + return r; + + if (c->root_image_options) { + _cleanup_free_ char *options = NULL; + + LIST_FOREACH(mount_options, o, c->root_image_options) { + if (isempty(o->options)) + continue; + + _cleanup_free_ char *escaped = NULL; + escaped = shell_escape(o->options, ":"); + if (!escaped) + return log_oom_debug(); + + if (!strextend(&options, + " ", + partition_designator_to_string(o->partition_designator), + ":", + escaped)) + return log_oom_debug(); + } + + r = serialize_item(f, "exec-context-root-image-options", options); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-context-root-verity", c->root_verity); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-root-hash-path", c->root_hash_path); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-root-hash-sig-path", c->root_hash_sig_path); + if (r < 0) + return r; + + r = serialize_item_hexmem(f, "exec-context-root-hash", c->root_hash, c->root_hash_size); + if (r < 0) + return r; + + r = serialize_item_base64mem(f, "exec-context-root-hash-sig", c->root_hash_sig, c->root_hash_sig_size); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-root-ephemeral", c->root_ephemeral); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-context-umask", "%04o", c->umask); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-non-blocking", c->non_blocking); + if (r < 0) + return r; + + r = serialize_item_tristate(f, "exec-context-private-mounts", c->private_mounts); + if (r < 0) + return r; + + r = serialize_item_tristate(f, "exec-context-memory-ksm", c->memory_ksm); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-private-tmp", c->private_tmp); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-private-devices", c->private_devices); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-protect-kernel-tunables", c->protect_kernel_tunables); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-protect-kernel-modules", c->protect_kernel_modules); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-protect-kernel-logs", c->protect_kernel_logs); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-protect-clock", c->protect_clock); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-protect-control-groups", c->protect_control_groups); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-private-network", c->private_network); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-private-users", c->private_users); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-private-ipc", c->private_ipc); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-remove-ipc", c->remove_ipc); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-protect-home", protect_home_to_string(c->protect_home)); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-protect-system", protect_system_to_string(c->protect_system)); + if (r < 0) + return r; + + if (c->mount_apivfs_set) { + r = serialize_bool(f, "exec-context-mount-api-vfs", c->mount_apivfs); + if (r < 0) + return r; + } + + r = serialize_bool_elide(f, "exec-context-same-pgrp", c->same_pgrp); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-cpu-sched-reset-on-fork", c->cpu_sched_reset_on_fork); + if (r < 0) + return r; + + r = serialize_bool(f, "exec-context-ignore-sigpipe", c->ignore_sigpipe); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-memory-deny-write-execute", c->memory_deny_write_execute); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-restrict-realtime", c->restrict_realtime); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-restrict-suid-sgid", c->restrict_suid_sgid); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-keyring-mode", exec_keyring_mode_to_string(c->keyring_mode)); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-protect-hostname", c->protect_hostname); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-protect-proc", protect_proc_to_string(c->protect_proc)); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-proc-subset", proc_subset_to_string(c->proc_subset)); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode)); + if (r < 0) + return r; + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + _cleanup_free_ char *key = NULL, *value = NULL; + + key = strjoin("exec-context-directories-", exec_directory_type_to_string(dt)); + if (!key) + return log_oom_debug(); + + if (asprintf(&value, "%04o", c->directories[dt].mode) < 0) + return log_oom_debug(); + + FOREACH_ARRAY(i, c->directories[dt].items, c->directories[dt].n_items) { + _cleanup_free_ char *path_escaped = NULL; + + path_escaped = shell_escape(i->path, ":" WHITESPACE); + if (!path_escaped) + return log_oom_debug(); + + if (!strextend(&value, " ", path_escaped)) + return log_oom_debug(); + + if (!strextend(&value, ":", yes_no(i->only_create))) + return log_oom_debug(); + + STRV_FOREACH(d, i->symlinks) { + _cleanup_free_ char *link_escaped = NULL; + + link_escaped = shell_escape(*d, ":" WHITESPACE); + if (!link_escaped) + return log_oom_debug(); + + if (!strextend(&value, ":", link_escaped)) + return log_oom_debug(); + } + } + + r = serialize_item(f, key, value); + if (r < 0) + return r; + } + + r = serialize_usec(f, "exec-context-timeout-clean-usec", c->timeout_clean_usec); + if (r < 0) + return r; + + if (c->nice_set) { + r = serialize_item_format(f, "exec-context-nice", "%i", c->nice); + if (r < 0) + return r; + } + + r = serialize_bool_elide(f, "exec-context-working-directory-missing-ok", c->working_directory_missing_ok); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-working-directory-home", c->working_directory_home); + if (r < 0) + return r; + + if (c->oom_score_adjust_set) { + r = serialize_item_format(f, "exec-context-oom-score-adjust", "%i", c->oom_score_adjust); + if (r < 0) + return r; + } + + if (c->coredump_filter_set) { + r = serialize_item_format(f, "exec-context-coredump-filter", "%"PRIx64, c->coredump_filter); + if (r < 0) + return r; + } + + for (unsigned i = 0; i < RLIM_NLIMITS; i++) { + _cleanup_free_ char *key = NULL, *limit = NULL; + + if (!c->rlimit[i]) + continue; + + key = strjoin("exec-context-limit-", rlimit_to_string(i)); + if (!key) + return log_oom_debug(); + + r = rlimit_format(c->rlimit[i], &limit); + if (r < 0) + return r; + + r = serialize_item(f, key, limit); + if (r < 0) + return r; + } + + if (c->ioprio_set) { + r = serialize_item_format(f, "exec-context-ioprio", "%d", c->ioprio); + if (r < 0) + return r; + } + + if (c->cpu_sched_set) { + _cleanup_free_ char *policy_str = NULL; + + r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-cpu-scheduling-policy", policy_str); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-context-cpu-scheduling-priority", "%i", c->cpu_sched_priority); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-cpu-scheduling-reset-on-fork", c->cpu_sched_reset_on_fork); + if (r < 0) + return r; + } + + if (c->cpu_set.set) { + _cleanup_free_ char *affinity = NULL; + + affinity = cpu_set_to_range_string(&c->cpu_set); + if (!affinity) + return log_oom_debug(); + + r = serialize_item(f, "exec-context-cpu-affinity", affinity); + if (r < 0) + return r; + } + + if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) { + _cleanup_free_ char *nodes = NULL; + + nodes = cpu_set_to_range_string(&c->numa_policy.nodes); + if (!nodes) + return log_oom_debug(); + + if (nodes) { + r = serialize_item(f, "exec-context-numa-mask", nodes); + if (r < 0) + return r; + } + + r = serialize_item_format(f, "exec-context-numa-policy", "%d", c->numa_policy.type); + if (r < 0) + return r; + } + + r = serialize_bool_elide(f, "exec-context-cpu-affinity-from-numa", c->cpu_affinity_from_numa); + if (r < 0) + return r; + + if (c->timer_slack_nsec != NSEC_INFINITY) { + r = serialize_item_format(f, "exec-context-timer-slack-nsec", NSEC_FMT, c->timer_slack_nsec); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-context-std-input", exec_input_to_string(c->std_input)); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-std-output", exec_output_to_string(c->std_output)); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-std-error", exec_output_to_string(c->std_error)); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-stdio-as-fds", c->stdio_as_fds); + if (r < 0) + return r; + + switch (c->std_input) { + case EXEC_INPUT_NAMED_FD: + r = serialize_item(f, "exec-context-std-input-fd-name", c->stdio_fdname[STDIN_FILENO]); + if (r < 0) + return r; + break; + + case EXEC_INPUT_FILE: + r = serialize_item(f, "exec-context-std-input-file", c->stdio_file[STDIN_FILENO]); + if (r < 0) + return r; + break; + + default: + break; + } + + r = serialize_std_out_err(c, f, STDOUT_FILENO); + if (r < 0) + return r; + + r = serialize_std_out_err(c, f, STDERR_FILENO); + if (r < 0) + return r; + + r = serialize_item_base64mem(f, "exec-context-stdin-data", c->stdin_data, c->stdin_data_size); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-tty-path", c->tty_path); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-tty-reset", c->tty_reset); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-tty-vhangup", c->tty_vhangup); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-tty-vt-disallocate", c->tty_vt_disallocate); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-context-tty-rows", "%u", c->tty_rows); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-context-tty-columns", "%u", c->tty_cols); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-context-syslog-priority", "%i", c->syslog_priority); + if (r < 0) + return r; + + r = serialize_bool(f, "exec-context-syslog-level-prefix", c->syslog_level_prefix); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-syslog-identifier", c->syslog_identifier); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-context-log-level-max", "%d", c->log_level_max); + if (r < 0) + return r; + + if (c->log_ratelimit_interval_usec > 0) { + r = serialize_usec(f, "exec-context-log-ratelimit-interval-usec", c->log_ratelimit_interval_usec); + if (r < 0) + return r; + } + + if (c->log_ratelimit_burst > 0) { + r = serialize_item_format(f, "exec-context-log-ratelimit-burst", "%u", c->log_ratelimit_burst); + if (r < 0) + return r; + } + + r = serialize_string_set(f, "exec-context-log-filter-allowed-patterns", c->log_filter_allowed_patterns); + if (r < 0) + return r; + + r = serialize_string_set(f, "exec-context-log-filter-denied-patterns", c->log_filter_denied_patterns); + if (r < 0) + return r; + + FOREACH_ARRAY(field, c->log_extra_fields, c->n_log_extra_fields) { + r = serialize_item(f, "exec-context-log-extra-fields", field->iov_base); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-context-log-namespace", c->log_namespace); + if (r < 0) + return r; + + if (c->secure_bits != 0) { + r = serialize_item_format(f, "exec-context-secure-bits", "%d", c->secure_bits); + if (r < 0) + return r; + } + + if (c->capability_bounding_set != CAP_MASK_UNSET) { + r = serialize_item_format(f, "exec-context-capability-bounding-set", "%" PRIu64, c->capability_bounding_set); + if (r < 0) + return r; + } + + if (c->capability_ambient_set != 0) { + r = serialize_item_format(f, "exec-context-capability-ambient-set", "%" PRIu64, c->capability_ambient_set); + if (r < 0) + return r; + } + + if (c->user) { + r = serialize_item(f, "exec-context-user", c->user); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-context-group", c->group); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-dynamic-user", c->dynamic_user); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-supplementary-groups", c->supplementary_groups); + if (r < 0) + return r; + + r = serialize_item_tristate(f, "exec-context-set-login-environment", c->set_login_environment); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-pam-name", c->pam_name); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-read-write-paths", c->read_write_paths); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-read-only-paths", c->read_only_paths); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-inaccessible-paths", c->inaccessible_paths); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-exec-paths", c->exec_paths); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-no-exec-paths", c->no_exec_paths); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-context-exec-search-path", c->exec_search_path); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-context-mount-propagation-flag", "%lu", c->mount_propagation_flag); + if (r < 0) + return r; + + FOREACH_ARRAY(mount, c->bind_mounts, c->n_bind_mounts) { + _cleanup_free_ char *src_escaped = NULL, *dst_escaped = NULL; + + src_escaped = shell_escape(mount->source, ":" WHITESPACE); + if (!src_escaped) + return log_oom_debug(); + + dst_escaped = shell_escape(mount->destination, ":" WHITESPACE); + if (!dst_escaped) + return log_oom_debug(); + + r = serialize_item_format(f, + mount->read_only ? "exec-context-bind-read-only-path" : "exec-context-bind-path", + "%s%s:%s:%s", + mount->ignore_enoent ? "-" : "", + src_escaped, + dst_escaped, + mount->recursive ? "rbind" : "norbind"); + if (r < 0) + return r; + } + + FOREACH_ARRAY(tmpfs, c->temporary_filesystems, c->n_temporary_filesystems) { + _cleanup_free_ char *escaped = NULL; + + if (!isempty(tmpfs->options)) { + escaped = shell_escape(tmpfs->options, ":"); + if (!escaped) + return log_oom_debug(); + } + + r = serialize_item_format(f, "exec-context-temporary-filesystems", "%s%s%s", + tmpfs->path, + isempty(escaped) ? "" : ":", + strempty(escaped)); + if (r < 0) + return r; + } + + r = serialize_item(f, "exec-context-utmp-id", c->utmp_id); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-utmp-mode", exec_utmp_mode_to_string(c->utmp_mode)); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-no-new-privileges", c->no_new_privileges); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-selinux-context-ignore", c->selinux_context_ignore); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-apparmor-profile-ignore", c->apparmor_profile_ignore); + if (r < 0) + return r; + + r = serialize_bool_elide(f, "exec-context-smack-process-label-ignore", c->smack_process_label_ignore); + if (r < 0) + return r; + + if (c->selinux_context) { + r = serialize_item_format(f, "exec-context-selinux-context", + "%s%s", + c->selinux_context_ignore ? "-" : "", + c->selinux_context); + if (r < 0) + return r; + } + + if (c->apparmor_profile) { + r = serialize_item_format(f, "exec-context-apparmor-profile", + "%s%s", + c->apparmor_profile_ignore ? "-" : "", + c->apparmor_profile); + if (r < 0) + return r; + } + + if (c->smack_process_label) { + r = serialize_item_format(f, "exec-context-smack-process-label", + "%s%s", + c->smack_process_label_ignore ? "-" : "", + c->smack_process_label); + if (r < 0) + return r; + } + + if (c->personality != PERSONALITY_INVALID) { + r = serialize_item(f, "exec-context-personality", personality_to_string(c->personality)); + if (r < 0) + return r; + } + + r = serialize_bool_elide(f, "exec-context-lock-personality", c->lock_personality); + if (r < 0) + return r; + +#if HAVE_SECCOMP + if (!hashmap_isempty(c->syscall_filter)) { + void *errno_num, *id; + HASHMAP_FOREACH_KEY(errno_num, id, c->syscall_filter) { + r = serialize_item_format(f, "exec-context-syscall-filter", "%d %d", PTR_TO_INT(id) - 1, PTR_TO_INT(errno_num)); + if (r < 0) + return r; + } + } + + if (!set_isempty(c->syscall_archs)) { + void *id; + SET_FOREACH(id, c->syscall_archs) { + r = serialize_item_format(f, "exec-context-syscall-archs", "%u", PTR_TO_UINT(id) - 1); + if (r < 0) + return r; + } + } + + if (c->syscall_errno > 0) { + r = serialize_item_format(f, "exec-context-syscall-errno", "%d", c->syscall_errno); + if (r < 0) + return r; + } + + r = serialize_bool_elide(f, "exec-context-syscall-allow-list", c->syscall_allow_list); + if (r < 0) + return r; + + if (!hashmap_isempty(c->syscall_log)) { + void *errno_num, *id; + HASHMAP_FOREACH_KEY(errno_num, id, c->syscall_log) { + r = serialize_item_format(f, "exec-context-syscall-log", "%d %d", PTR_TO_INT(id) - 1, PTR_TO_INT(errno_num)); + if (r < 0) + return r; + } + } + + r = serialize_bool_elide(f, "exec-context-syscall-log-allow-list", c->syscall_log_allow_list); + if (r < 0) + return r; +#endif + + if (c->restrict_namespaces != NAMESPACE_FLAGS_INITIAL) { + r = serialize_item_format(f, "exec-context-restrict-namespaces", "%lu", c->restrict_namespaces); + if (r < 0) + return r; + } + +#if HAVE_LIBBPF + if (exec_context_restrict_filesystems_set(c)) { + char *fs; + SET_FOREACH(fs, c->restrict_filesystems) { + r = serialize_item(f, "exec-context-restrict-filesystems", fs); + if (r < 0) + return r; + } + } + + r = serialize_bool_elide(f, "exec-context-restrict-filesystems-allow-list", c->restrict_filesystems_allow_list); + if (r < 0) + return r; +#endif + + if (!set_isempty(c->address_families)) { + void *afp; + + SET_FOREACH(afp, c->address_families) { + int af = PTR_TO_INT(afp); + + if (af <= 0 || af >= af_max()) + continue; + + r = serialize_item_format(f, "exec-context-address-families", "%d", af); + if (r < 0) + return r; + } + } + + r = serialize_bool_elide(f, "exec-context-address-families-allow-list", c->address_families_allow_list); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-network-namespace-path", c->network_namespace_path); + if (r < 0) + return r; + + r = serialize_item(f, "exec-context-ipc-namespace-path", c->ipc_namespace_path); + if (r < 0) + return r; + + FOREACH_ARRAY(mount, c->mount_images, c->n_mount_images) { + _cleanup_free_ char *s = NULL, *source_escaped = NULL, *dest_escaped = NULL; + + source_escaped = shell_escape(mount->source, WHITESPACE); + if (!source_escaped) + return log_oom_debug(); + + dest_escaped = shell_escape(mount->destination, WHITESPACE); + if (!dest_escaped) + return log_oom_debug(); + + s = strjoin(mount->ignore_enoent ? "-" : "", + source_escaped, + " ", + dest_escaped); + if (!s) + return log_oom_debug(); + + LIST_FOREACH(mount_options, o, mount->mount_options) { + _cleanup_free_ char *escaped = NULL; + + if (isempty(o->options)) + continue; + + escaped = shell_escape(o->options, ":"); + if (!escaped) + return log_oom_debug(); + + if (!strextend(&s, + " ", + partition_designator_to_string(o->partition_designator), + ":", + escaped)) + return log_oom_debug(); + } + + r = serialize_item(f, "exec-context-mount-image", s); + if (r < 0) + return r; + } + + FOREACH_ARRAY(mount, c->extension_images, c->n_extension_images) { + _cleanup_free_ char *s = NULL, *source_escaped = NULL; + + source_escaped = shell_escape(mount->source, ":" WHITESPACE); + if (!source_escaped) + return log_oom_debug(); + + s = strjoin(mount->ignore_enoent ? "-" : "", + source_escaped); + if (!s) + return log_oom_debug(); + + LIST_FOREACH(mount_options, o, mount->mount_options) { + _cleanup_free_ char *escaped = NULL; + + if (isempty(o->options)) + continue; + + escaped = shell_escape(o->options, ":"); + if (!escaped) + return log_oom_debug(); + + if (!strextend(&s, + " ", + partition_designator_to_string(o->partition_designator), + ":", + escaped)) + return log_oom_debug(); + } + + r = serialize_item(f, "exec-context-extension-image", s); + if (r < 0) + return r; + } + + r = serialize_strv(f, "exec-context-extension-directories", c->extension_directories); + if (r < 0) + return r; + + ExecSetCredential *sc; + HASHMAP_FOREACH(sc, c->set_credentials) { + _cleanup_free_ char *data = NULL; + + if (base64mem(sc->data, sc->size, &data) < 0) + return log_oom_debug(); + + r = serialize_item_format(f, "exec-context-set-credentials", "%s %s %s", sc->id, yes_no(sc->encrypted), data); + if (r < 0) + return r; + } + + ExecLoadCredential *lc; + HASHMAP_FOREACH(lc, c->load_credentials) { + r = serialize_item_format(f, "exec-context-load-credentials", "%s %s %s", lc->id, yes_no(lc->encrypted), lc->path); + if (r < 0) + return r; + } + + if (!set_isempty(c->import_credentials)) { + char *ic; + SET_FOREACH(ic, c->import_credentials) { + r = serialize_item(f, "exec-context-import-credentials", ic); + if (r < 0) + return r; + } + } + + r = serialize_image_policy(f, "exec-context-root-image-policy", c->root_image_policy); + if (r < 0) + return r; + + r = serialize_image_policy(f, "exec-context-mount-image-policy", c->mount_image_policy); + if (r < 0) + return r; + + r = serialize_image_policy(f, "exec-context-extension-image-policy", c->extension_image_policy); + if (r < 0) + return r; + + fputc('\n', f); /* End marker */ + + return 0; +} + +static int exec_context_deserialize(ExecContext *c, FILE *f) { + int r; + + assert(f); + + if (!c) + return 0; + + for (;;) { + _cleanup_free_ char *l = NULL; + const char *val; + + r = deserialize_read_line(f, &l); + if (r < 0) + return r; + if (r == 0) /* eof or end marker */ + break; + + if ((val = startswith(l, "exec-context-environment="))) { + r = deserialize_strv(val, &c->environment); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-environment-files="))) { + r = deserialize_strv(val, &c->environment_files); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-pass-environment="))) { + r = deserialize_strv(val, &c->pass_environment); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-unset-environment="))) { + r = deserialize_strv(val, &c->unset_environment); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-working-directory="))) { + r = free_and_strdup(&c->working_directory, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-root-directory="))) { + r = free_and_strdup(&c->root_directory, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-root-image="))) { + r = free_and_strdup(&c->root_image, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-root-image-options="))) { + for (;;) { + _cleanup_free_ char *word = NULL, *mount_options = NULL, *partition = NULL; + PartitionDesignator partition_designator; + MountOptions *o = NULL; + const char *p; + + r = extract_first_word(&val, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + p = word; + r = extract_many_words(&p, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + if (r < 0) + return r; + if (r == 0) + continue; + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) + return -EINVAL; + + o = new(MountOptions, 1); + if (!o) + return log_oom_debug(); + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = TAKE_PTR(mount_options), + }; + LIST_APPEND(mount_options, c->root_image_options, o); + } + } else if ((val = startswith(l, "exec-context-root-verity="))) { + r = free_and_strdup(&c->root_verity, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-root-hash-path="))) { + r = free_and_strdup(&c->root_hash_path, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-root-hash-sig-path="))) { + r = free_and_strdup(&c->root_hash_sig_path, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-root-hash="))) { + c->root_hash = mfree(c->root_hash); + r = unhexmem(val, strlen(val), &c->root_hash, &c->root_hash_size); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-root-hash-sig="))) { + c->root_hash_sig = mfree(c->root_hash_sig); + r= unbase64mem(val, strlen(val), &c->root_hash_sig, &c->root_hash_sig_size); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-root-ephemeral="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->root_ephemeral = r; + } else if ((val = startswith(l, "exec-context-umask="))) { + r = parse_mode(val, &c->umask); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-private-non-blocking="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->non_blocking = r; + } else if ((val = startswith(l, "exec-context-private-mounts="))) { + r = safe_atoi(val, &c->private_mounts); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-memory-ksm="))) { + r = safe_atoi(val, &c->memory_ksm); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-private-tmp="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->private_tmp = r; + } else if ((val = startswith(l, "exec-context-private-devices="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->private_devices = r; + } else if ((val = startswith(l, "exec-context-protect-kernel-tunables="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->protect_kernel_tunables = r; + } else if ((val = startswith(l, "exec-context-protect-kernel-modules="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->protect_kernel_modules = r; + } else if ((val = startswith(l, "exec-context-protect-kernel-logs="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->protect_kernel_logs = r; + } else if ((val = startswith(l, "exec-context-protect-clock="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->protect_clock = r; + } else if ((val = startswith(l, "exec-context-protect-control-groups="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->protect_control_groups = r; + } else if ((val = startswith(l, "exec-context-private-network="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->private_network = r; + } else if ((val = startswith(l, "exec-context-private-users="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->private_users = r; + } else if ((val = startswith(l, "exec-context-private-ipc="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->private_ipc = r; + } else if ((val = startswith(l, "exec-context-remove-ipc="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->remove_ipc = r; + } else if ((val = startswith(l, "exec-context-protect-home="))) { + c->protect_home = protect_home_from_string(val); + if (c->protect_home < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-context-protect-system="))) { + c->protect_system = protect_system_from_string(val); + if (c->protect_system < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-context-mount-api-vfs="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->mount_apivfs = r; + c->mount_apivfs_set = true; + } else if ((val = startswith(l, "exec-context-same-pgrp="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->same_pgrp = r; + } else if ((val = startswith(l, "exec-context-cpu-sched-reset-on-fork="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->cpu_sched_reset_on_fork = r; + } else if ((val = startswith(l, "exec-context-non-blocking="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->non_blocking = r; + } else if ((val = startswith(l, "exec-context-ignore-sigpipe="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->ignore_sigpipe = r; + } else if ((val = startswith(l, "exec-context-memory-deny-write-execute="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->memory_deny_write_execute = r; + } else if ((val = startswith(l, "exec-context-restrict-realtime="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->restrict_realtime = r; + } else if ((val = startswith(l, "exec-context-restrict-suid-sgid="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->restrict_suid_sgid = r; + } else if ((val = startswith(l, "exec-context-keyring-mode="))) { + c->keyring_mode = exec_keyring_mode_from_string(val); + if (c->keyring_mode < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-context-protect-hostname="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->protect_hostname = r; + } else if ((val = startswith(l, "exec-context-protect-proc="))) { + c->protect_proc = protect_proc_from_string(val); + if (c->protect_proc < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-context-proc-subset="))) { + c->proc_subset = proc_subset_from_string(val); + if (c->proc_subset < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) { + c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val); + if (c->runtime_directory_preserve_mode < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-context-directories-"))) { + _cleanup_free_ char *type = NULL, *mode = NULL; + ExecDirectoryType dt; + + r = extract_many_words(&val, "= ", 0, &type, &mode, NULL); + if (r < 0) + return r; + if (r == 0 || !mode) + return -EINVAL; + + dt = exec_directory_type_from_string(type); + if (dt < 0) + return -EINVAL; + + r = parse_mode(mode, &c->directories[dt].mode); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *tuple = NULL, *path = NULL, *only_create = NULL; + const char *p; + + /* Use EXTRACT_UNESCAPE_RELAX here, as we unescape the colons in subsequent calls */ + r = extract_first_word(&val, &tuple, WHITESPACE, EXTRACT_UNESCAPE_SEPARATORS|EXTRACT_UNESCAPE_RELAX); + if (r < 0) + return r; + if (r == 0) + break; + + p = tuple; + r = extract_many_words(&p, ":", EXTRACT_UNESCAPE_SEPARATORS, &path, &only_create, NULL); + if (r < 0) + return r; + if (r < 2) + continue; + + r = exec_directory_add(&c->directories[dt], path, NULL); + if (r < 0) + return r; + + r = parse_boolean(only_create); + if (r < 0) + return r; + c->directories[dt].items[c->directories[dt].n_items - 1].only_create = r; + + if (isempty(p)) + continue; + + for (;;) { + _cleanup_free_ char *link = NULL; + + r = extract_first_word(&p, &link, ":", EXTRACT_UNESCAPE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + break; + + r = strv_consume(&c->directories[dt].items[c->directories[dt].n_items - 1].symlinks, TAKE_PTR(link)); + if (r < 0) + return r; + } + } + } else if ((val = startswith(l, "exec-context-timeout-clean-usec="))) { + r = deserialize_usec(val, &c->timeout_clean_usec); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-nice="))) { + r = safe_atoi(val, &c->nice); + if (r < 0) + return r; + c->nice_set = true; + } else if ((val = startswith(l, "exec-context-working-directory-missing-ok="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->working_directory_missing_ok = r; + } else if ((val = startswith(l, "exec-context-working-directory-home="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->working_directory_home = r; + } else if ((val = startswith(l, "exec-context-oom-score-adjust="))) { + r = safe_atoi(val, &c->oom_score_adjust); + if (r < 0) + return r; + c->oom_score_adjust_set = true; + } else if ((val = startswith(l, "exec-context-coredump-filter="))) { + r = safe_atoux64(val, &c->coredump_filter); + if (r < 0) + return r; + c->coredump_filter_set = true; + } else if ((val = startswith(l, "exec-context-limit-"))) { + _cleanup_free_ struct rlimit *rlimit = NULL; + _cleanup_free_ char *limit = NULL; + int type; + + r = extract_first_word(&val, &limit, "=", 0); + if (r < 0) + return r; + if (r == 0 || !val) + return -EINVAL; + + type = rlimit_from_string(limit); + if (type < 0) + return -EINVAL; + + if (!c->rlimit[type]) { + rlimit = new0(struct rlimit, 1); + if (!rlimit) + return log_oom_debug(); + + r = rlimit_parse(type, val, rlimit); + if (r < 0) + return r; + + c->rlimit[type] = TAKE_PTR(rlimit); + } else { + r = rlimit_parse(type, val, c->rlimit[type]); + if (r < 0) + return r; + } + } else if ((val = startswith(l, "exec-context-ioprio="))) { + r = safe_atoi(val, &c->ioprio); + if (r < 0) + return r; + c->ioprio_set = true; + } else if ((val = startswith(l, "exec-context-cpu-scheduling-policy="))) { + c->cpu_sched_policy = sched_policy_from_string(val); + if (c->cpu_sched_policy < 0) + return -EINVAL; + c->cpu_sched_set = true; + } else if ((val = startswith(l, "exec-context-cpu-scheduling-priority="))) { + r = safe_atoi(val, &c->cpu_sched_priority); + if (r < 0) + return r; + c->cpu_sched_set = true; + } else if ((val = startswith(l, "exec-context-cpu-scheduling-reset-on-fork="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->cpu_sched_reset_on_fork = r; + c->cpu_sched_set = true; + } else if ((val = startswith(l, "exec-context-cpu-affinity="))) { + if (c->cpu_set.set) + return -EINVAL; /* duplicated */ + + r = parse_cpu_set(val, &c->cpu_set); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-numa-mask="))) { + if (c->numa_policy.nodes.set) + return -EINVAL; /* duplicated */ + + r = parse_cpu_set(val, &c->numa_policy.nodes); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-numa-policy="))) { + r = safe_atoi(val, &c->numa_policy.type); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-cpu-affinity-from-numa="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->cpu_affinity_from_numa = r; + } else if ((val = startswith(l, "exec-context-timer-slack-nsec="))) { + r = deserialize_usec(val, (usec_t *)&c->timer_slack_nsec); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-input="))) { + c->std_input = exec_input_from_string(val); + if (c->std_input < 0) + return c->std_input; + } else if ((val = startswith(l, "exec-context-std-output="))) { + c->std_output = exec_output_from_string(val); + if (c->std_output < 0) + return c->std_output; + } else if ((val = startswith(l, "exec-context-std-error="))) { + c->std_error = exec_output_from_string(val); + if (c->std_error < 0) + return c->std_error; + } else if ((val = startswith(l, "exec-context-stdio-as-fds="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->stdio_as_fds = r; + } else if ((val = startswith(l, "exec-context-std-input-fd-name="))) { + r = free_and_strdup(&c->stdio_fdname[STDIN_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-output-fd-name="))) { + r = free_and_strdup(&c->stdio_fdname[STDOUT_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-error-fd-name="))) { + r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-input-file="))) { + r = free_and_strdup(&c->stdio_file[STDIN_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-output-file="))) { + r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-output-file-append="))) { + r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-output-file-truncate="))) { + r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-error-file="))) { + r = free_and_strdup(&c->stdio_file[STDERR_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-error-file-append="))) { + r = free_and_strdup(&c->stdio_file[STDERR_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-std-error-file-truncate="))) { + r = free_and_strdup(&c->stdio_file[STDERR_FILENO], val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-stdin-data="))) { + if (c->stdin_data) + return -EINVAL; /* duplicated */ + + r = unbase64mem(val, strlen(val), &c->stdin_data, &c->stdin_data_size); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-tty-path="))) { + r = free_and_strdup(&c->tty_path, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-tty-reset="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->tty_reset = r; + } else if ((val = startswith(l, "exec-context-tty-vhangup="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->tty_vhangup = r; + } else if ((val = startswith(l, "exec-context-tty-vt-disallocate="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->tty_vt_disallocate = r; + } else if ((val = startswith(l, "exec-context-tty-rows="))) { + r = safe_atou(val, &c->tty_rows); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-tty-columns="))) { + r = safe_atou(val, &c->tty_cols); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-syslog-priority="))) { + r = safe_atoi(val, &c->syslog_priority); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-syslog-level-prefix="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->syslog_level_prefix = r; + } else if ((val = startswith(l, "exec-context-syslog-identifier="))) { + r = free_and_strdup(&c->syslog_identifier, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-log-level-max="))) { + r = safe_atoi(val, &c->log_level_max); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-log-ratelimit-interval-usec="))) { + r = deserialize_usec(val, &c->log_ratelimit_interval_usec); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-log-ratelimit-burst="))) { + r = safe_atou(val, &c->log_ratelimit_burst); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-log-filter-allowed-patterns="))) { + r = set_put_strdup(&c->log_filter_allowed_patterns, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-log-filter-denied-patterns="))) { + r = set_put_strdup(&c->log_filter_denied_patterns, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-log-extra-fields="))) { + if (!GREEDY_REALLOC(c->log_extra_fields, c->n_log_extra_fields + 1)) + return log_oom_debug(); + + c->log_extra_fields[c->n_log_extra_fields++].iov_base = strdup(val); + if (!c->log_extra_fields[c->n_log_extra_fields-1].iov_base) + return log_oom_debug(); + } else if ((val = startswith(l, "exec-context-log-namespace="))) { + r = free_and_strdup(&c->log_namespace, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-secure-bits="))) { + r = safe_atoi(val, &c->secure_bits); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-capability-bounding-set="))) { + r = safe_atou64(val, &c->capability_bounding_set); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-capability-ambient-set="))) { + r = safe_atou64(val, &c->capability_ambient_set); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-user="))) { + r = free_and_strdup(&c->user, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-group="))) { + r = free_and_strdup(&c->group, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-dynamic-user="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->dynamic_user = r; + } else if ((val = startswith(l, "exec-context-supplementary-groups="))) { + r = deserialize_strv(val, &c->supplementary_groups); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-set-login-environment="))) { + r = safe_atoi(val, &c->set_login_environment); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-pam-name="))) { + r = free_and_strdup(&c->pam_name, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-read-write-paths="))) { + r = deserialize_strv(val, &c->read_write_paths); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-read-only-paths="))) { + r = deserialize_strv(val, &c->read_only_paths); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-inaccessible-paths="))) { + r = deserialize_strv(val, &c->inaccessible_paths); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-exec-paths="))) { + r = deserialize_strv(val, &c->exec_paths); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-no-exec-paths="))) { + r = deserialize_strv(val, &c->no_exec_paths); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-exec-search-path="))) { + r = deserialize_strv(val, &c->exec_search_path); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-mount-propagation-flag="))) { + r = safe_atolu(val, &c->mount_propagation_flag); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-bind-read-only-path="))) { + _cleanup_free_ char *source = NULL, *destination = NULL; + bool rbind = true, ignore_enoent = false; + char *s = NULL, *d = NULL; + + r = extract_first_word(&val, + &source, + ":" WHITESPACE, + EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + s = source; + if (s[0] == '-') { + ignore_enoent = true; + s++; + } + + if (val && val[-1] == ':') { + r = extract_first_word(&val, + &destination, + ":" WHITESPACE, + EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + continue; + + d = destination; + + if (val && val[-1] == ':') { + _cleanup_free_ char *options = NULL; + + r = extract_first_word(&val, &options, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return -r; + + if (isempty(options) || streq(options, "rbind")) + rbind = true; + else if (streq(options, "norbind")) + rbind = false; + else + continue; + } + } else + d = s; + + r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts, + &(BindMount) { + .source = s, + .destination = d, + .read_only = true, + .recursive = rbind, + .ignore_enoent = ignore_enoent, + }); + if (r < 0) + return log_oom_debug(); + } else if ((val = startswith(l, "exec-context-bind-path="))) { + _cleanup_free_ char *source = NULL, *destination = NULL; + bool rbind = true, ignore_enoent = false; + char *s = NULL, *d = NULL; + + r = extract_first_word(&val, + &source, + ":" WHITESPACE, + EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + s = source; + if (s[0] == '-') { + ignore_enoent = true; + s++; + } + + if (val && val[-1] == ':') { + r = extract_first_word(&val, + &destination, + ":" WHITESPACE, + EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + continue; + + d = destination; + + if (val && val[-1] == ':') { + _cleanup_free_ char *options = NULL; + + r = extract_first_word(&val, &options, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return -r; + + if (isempty(options) || streq(options, "rbind")) + rbind = true; + else if (streq(options, "norbind")) + rbind = false; + else + continue; + } + } else + d = s; + + r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts, + &(BindMount) { + .source = s, + .destination = d, + .read_only = false, + .recursive = rbind, + .ignore_enoent = ignore_enoent, + }); + if (r < 0) + return log_oom_debug(); + } else if ((val = startswith(l, "exec-context-temporary-filesystems="))) { + _cleanup_free_ char *path = NULL, *options = NULL; + + r = extract_many_words(&val, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &path, &options, NULL); + if (r < 0) + return r; + if (r < 1) + continue; + + r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, path, options); + if (r < 0) + return log_oom_debug(); + } else if ((val = startswith(l, "exec-context-utmp-id="))) { + r = free_and_strdup(&c->utmp_id, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-utmp-mode="))) { + c->utmp_mode = exec_utmp_mode_from_string(val); + if (c->utmp_mode < 0) + return c->utmp_mode; + } else if ((val = startswith(l, "exec-context-no-new-privileges="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->no_new_privileges = r; + } else if ((val = startswith(l, "exec-context-selinux-context-ignore="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->selinux_context_ignore = r; + } else if ((val = startswith(l, "exec-context-apparmor-profile-ignore="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->apparmor_profile_ignore = r; + } else if ((val = startswith(l, "exec-context-smack-process-label-ignore="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->smack_process_label_ignore = r; + } else if ((val = startswith(l, "exec-context-selinux-context="))) { + if (val[0] == '-') { + c->selinux_context_ignore = true; + val++; + } + + r = free_and_strdup(&c->selinux_context, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-apparmor-profile="))) { + if (val[0] == '-') { + c->apparmor_profile_ignore = true; + val++; + } + + r = free_and_strdup(&c->apparmor_profile, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-smack-process-label="))) { + if (val[0] == '-') { + c->smack_process_label_ignore = true; + val++; + } + + r = free_and_strdup(&c->smack_process_label, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-personality="))) { + c->personality = personality_from_string(val); + if (c->personality == PERSONALITY_INVALID) + return -EINVAL; + } else if ((val = startswith(l, "exec-context-lock-personality="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->lock_personality = r; +#if HAVE_SECCOMP + } else if ((val = startswith(l, "exec-context-syscall-filter="))) { + _cleanup_free_ char *s_id = NULL, *s_errno_num = NULL; + int id, errno_num; + + r = extract_many_words(&val, NULL, 0, &s_id, &s_errno_num, NULL); + if (r < 0) + return r; + if (r != 2) + continue; + + r = safe_atoi(s_id, &id); + if (r < 0) + return r; + + r = safe_atoi(s_errno_num, &errno_num); + if (r < 0) + return r; + + r = hashmap_ensure_put(&c->syscall_filter, NULL, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-syscall-archs="))) { + unsigned int id; + + r = safe_atou(val, &id); + if (r < 0) + return r; + + r = set_ensure_put(&c->syscall_archs, NULL, UINT_TO_PTR(id + 1)); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-syscall-errno="))) { + r = safe_atoi(val, &c->syscall_errno); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-syscall-allow-list="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->syscall_allow_list = r; + } else if ((val = startswith(l, "exec-context-syscall-log="))) { + _cleanup_free_ char *s_id = NULL, *s_errno_num = NULL; + int id, errno_num; + + r = extract_many_words(&val, " ", 0, &s_id, &s_errno_num, NULL); + if (r < 0) + return r; + if (r != 2) + continue; + + r = safe_atoi(s_id, &id); + if (r < 0) + return r; + + r = safe_atoi(s_errno_num, &errno_num); + if (r < 0) + return r; + + r = hashmap_ensure_put(&c->syscall_log, NULL, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-syscall-log-allow-list="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->syscall_log_allow_list = r; +#endif + } else if ((val = startswith(l, "exec-context-restrict-namespaces="))) { + r = safe_atolu(val, &c->restrict_namespaces); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-restrict-filesystems="))) { + r = set_ensure_allocated(&c->restrict_filesystems, &string_hash_ops); + if (r < 0) + return r; + + r = set_put_strdup(&c->restrict_filesystems, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-restrict-filesystems-allow-list="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->restrict_filesystems_allow_list = r; + } else if ((val = startswith(l, "exec-context-address-families="))) { + int af; + + r = safe_atoi(val, &af); + if (r < 0) + return r; + + r = set_ensure_put(&c->address_families, NULL, INT_TO_PTR(af)); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-address-families-allow-list="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->address_families_allow_list = r; + } else if ((val = startswith(l, "exec-context-network-namespace-path="))) { + r = free_and_strdup(&c->network_namespace_path, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-ipc-namespace-path="))) { + r = free_and_strdup(&c->ipc_namespace_path, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-mount-image="))) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *source = NULL, *destination = NULL; + bool permissive = false; + char *s; + + r = extract_many_words(&val, + NULL, + EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, + &source, + &destination, + NULL); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + s = source; + if (s[0] == '-') { + permissive = true; + s++; + } + + if (isempty(destination)) + continue; + + for (;;) { + _cleanup_free_ char *tuple = NULL, *partition = NULL, *opts = NULL; + PartitionDesignator partition_designator; + MountOptions *o = NULL; + const char *p; + + r = extract_first_word(&val, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + if (r == 0) + break; + + p = tuple; + r = extract_many_words(&p, + ":", + EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, + &partition, + &opts, + NULL); + if (r < 0) + return r; + if (r == 0) + continue; + if (r == 1) { + o = new(MountOptions, 1); + if (!o) + return log_oom_debug(); + *o = (MountOptions) { + .partition_designator = PARTITION_ROOT, + .options = TAKE_PTR(partition), + }; + LIST_APPEND(mount_options, options, o); + + continue; + } + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) + continue; + + o = new(MountOptions, 1); + if (!o) + return log_oom_debug(); + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = TAKE_PTR(opts), + }; + LIST_APPEND(mount_options, options, o); + } + + r = mount_image_add(&c->mount_images, &c->n_mount_images, + &(MountImage) { + .source = s, + .destination = destination, + .mount_options = options, + .ignore_enoent = permissive, + .type = MOUNT_IMAGE_DISCRETE, + }); + if (r < 0) + return log_oom_debug(); + } else if ((val = startswith(l, "exec-context-extension-image="))) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *source = NULL; + bool permissive = false; + char *s; + + r = extract_first_word(&val, + &source, + NULL, + EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + s = source; + if (s[0] == '-') { + permissive = true; + s++; + } + + for (;;) { + _cleanup_free_ char *tuple = NULL, *partition = NULL, *opts = NULL; + PartitionDesignator partition_designator; + MountOptions *o = NULL; + const char *p; + + r = extract_first_word(&val, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + if (r == 0) + break; + + p = tuple; + r = extract_many_words(&p, + ":", + EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, + &partition, + &opts, + NULL); + if (r < 0) + return r; + if (r == 0) + continue; + if (r == 1) { + o = new(MountOptions, 1); + if (!o) + return log_oom_debug(); + *o = (MountOptions) { + .partition_designator = PARTITION_ROOT, + .options = TAKE_PTR(partition), + }; + LIST_APPEND(mount_options, options, o); + + continue; + } + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) + continue; + + o = new(MountOptions, 1); + if (!o) + return log_oom_debug(); + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = TAKE_PTR(opts), + }; + LIST_APPEND(mount_options, options, o); + } + + r = mount_image_add(&c->extension_images, &c->n_extension_images, + &(MountImage) { + .source = s, + .mount_options = options, + .ignore_enoent = permissive, + .type = MOUNT_IMAGE_EXTENSION, + }); + if (r < 0) + return log_oom_debug(); + } else if ((val = startswith(l, "exec-context-extension-directories="))) { + r = deserialize_strv(val, &c->extension_directories); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-set-credentials="))) { + _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL; + _cleanup_free_ char *id = NULL, *encrypted = NULL, *data = NULL; + + r = extract_many_words(&val, " ", 0, &id, &encrypted, &data, NULL); + if (r < 0) + return r; + if (r != 3) + return -EINVAL; + + r = parse_boolean(encrypted); + if (r < 0) + return r; + + sc = new(ExecSetCredential, 1); + if (!sc) + return -ENOMEM; + + *sc = (ExecSetCredential) { + .id = TAKE_PTR(id), + .encrypted = r, + }; + + r = unbase64mem(data, strlen(data), &sc->data, &sc->size); + if (r < 0) + return r; + + r = hashmap_ensure_put(&c->set_credentials, &exec_set_credential_hash_ops, sc->id, sc); + if (r < 0) + return r; + + TAKE_PTR(sc); + } else if ((val = startswith(l, "exec-context-load-credentials="))) { + _cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL; + _cleanup_free_ char *id = NULL, *encrypted = NULL, *path = NULL; + + r = extract_many_words(&val, " ", 0, &id, &encrypted, &path, NULL); + if (r < 0) + return r; + if (r != 3) + return -EINVAL; + + r = parse_boolean(encrypted); + if (r < 0) + return r; + + lc = new(ExecLoadCredential, 1); + if (!lc) + return -ENOMEM; + + *lc = (ExecLoadCredential) { + .id = TAKE_PTR(id), + .path = TAKE_PTR(path), + .encrypted = r, + }; + + r = hashmap_ensure_put(&c->load_credentials, &exec_load_credential_hash_ops, lc->id, lc); + if (r < 0) + return r; + + TAKE_PTR(lc); + } else if ((val = startswith(l, "exec-context-import-credentials="))) { + r = set_ensure_allocated(&c->import_credentials, &string_hash_ops); + if (r < 0) + return r; + + r = set_put_strdup(&c->import_credentials, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-root-image-policy="))) { + if (c->root_image_policy) + return -EINVAL; /* duplicated */ + + r = image_policy_from_string(val, &c->root_image_policy); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-mount-image-policy="))) { + if (c->mount_image_policy) + return -EINVAL; /* duplicated */ + + r = image_policy_from_string(val, &c->mount_image_policy); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-context-extension-image-policy="))) { + if (c->extension_image_policy) + return -EINVAL; /* duplicated */ + + r = image_policy_from_string(val, &c->extension_image_policy); + if (r < 0) + return r; + } else + log_warning("Failed to parse serialized line, ignoring: %s", l); + } + + return 0; +} + +static int exec_command_serialize(const ExecCommand *c, FILE *f) { + int r; + + assert(c); + assert(f); + + r = serialize_item(f, "exec-command-path", c->path); + if (r < 0) + return r; + + r = serialize_strv(f, "exec-command-argv", c->argv); + if (r < 0) + return r; + + r = serialize_item_format(f, "exec-command-flags", "%d", (int) c->flags); + if (r < 0) + return r; + + fputc('\n', f); /* End marker */ + + return 0; +} + +static int exec_command_deserialize(ExecCommand *c, FILE *f) { + int r; + + assert(c); + assert(f); + + for (;;) { + _cleanup_free_ char *l = NULL; + const char *val; + + r = deserialize_read_line(f, &l); + if (r < 0) + return r; + if (r == 0) /* eof or end marker */ + break; + + if ((val = startswith(l, "exec-command-path="))) { + r = free_and_strdup(&c->path, val); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-command-argv="))) { + r = deserialize_strv(val, &c->argv); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-command-flags="))) { + r = safe_atoi(val, &c->flags); + if (r < 0) + return r; + } else + log_warning("Failed to parse serialized line, ignoring: %s", l); + + } + + return 0; +} + +int exec_serialize_invocation( + FILE *f, + FDSet *fds, + const ExecContext *ctx, + const ExecCommand *cmd, + const ExecParameters *p, + const ExecRuntime *rt, + const CGroupContext *cg) { + + int r; + + assert(f); + assert(fds); + + r = exec_context_serialize(ctx, f); + if (r < 0) + return log_debug_errno(r, "Failed to serialize context: %m"); + + r = exec_command_serialize(cmd, f); + if (r < 0) + return log_debug_errno(r, "Failed to serialize command: %m"); + + r = exec_parameters_serialize(p, ctx, f, fds); + if (r < 0) + return log_debug_errno(r, "Failed to serialize parameters: %m"); + + r = exec_runtime_serialize(rt, f, fds); + if (r < 0) + return log_debug_errno(r, "Failed to serialize runtime: %m"); + + r = exec_cgroup_context_serialize(cg, f); + if (r < 0) + return log_debug_errno(r, "Failed to serialize cgroup context: %m"); + + return 0; +} + +int exec_deserialize_invocation( + FILE *f, + FDSet *fds, + ExecContext *ctx, + ExecCommand *cmd, + ExecParameters *p, + ExecRuntime *rt, + CGroupContext *cg) { + + int r; + + assert(f); + assert(fds); + + r = exec_context_deserialize(ctx, f); + if (r < 0) + return log_debug_errno(r, "Failed to deserialize context: %m"); + + r = exec_command_deserialize(cmd, f); + if (r < 0) + return log_debug_errno(r, "Failed to deserialize command: %m"); + + r = exec_parameters_deserialize(p, f, fds); + if (r < 0) + return log_debug_errno(r, "Failed to deserialize parameters: %m"); + + r = exec_runtime_deserialize(rt, f, fds); + if (r < 0) + return log_debug_errno(r, "Failed to deserialize runtime: %m"); + + r = exec_cgroup_context_deserialize(cg, f); + if (r < 0) + return log_debug_errno(r, "Failed to deserialize cgroup context: %m"); + + return 0; +} diff --git a/src/core/execute-serialize.h b/src/core/execute-serialize.h new file mode 100644 index 0000000..89c8e09 --- /dev/null +++ b/src/core/execute-serialize.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "execute.h" + +/* These functions serialize/deserialize for invocation purposes (i.e.: serialized object is passed to a + * child process) rather than to save state across reload/reexec. */ + +int exec_serialize_invocation(FILE *f, + FDSet *fds, + const ExecContext *ctx, + const ExecCommand *cmd, + const ExecParameters *p, + const ExecRuntime *rt, + const CGroupContext *cg); + +int exec_deserialize_invocation(FILE *f, + FDSet *fds, + ExecContext *ctx, + ExecCommand *cmd, + ExecParameters *p, + ExecRuntime *rt, + CGroupContext *cg); diff --git a/src/core/execute.c b/src/core/execute.c new file mode 100644 index 0000000..8dbdfcf --- /dev/null +++ b/src/core/execute.c @@ -0,0 +1,2742 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* Must be included after */ + +#include "sd-messages.h" + +#include "af-list.h" +#include "alloc-util.h" +#include "async.h" +#include "cap-list.h" +#include "capability-util.h" +#include "cgroup-setup.h" +#include "constants.h" +#include "cpu-set-util.h" +#include "dev-setup.h" +#include "env-file.h" +#include "env-util.h" +#include "errno-list.h" +#include "escape.h" +#include "exec-credential.h" +#include "execute.h" +#include "execute-serialize.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "glob-util.h" +#include "hexdecoct.h" +#include "ioprio-util.h" +#include "lock-util.h" +#include "log.h" +#include "macro.h" +#include "manager.h" +#include "manager-dump.h" +#include "memory-util.h" +#include "missing_fs.h" +#include "missing_prctl.h" +#include "mkdir-label.h" +#include "namespace.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "seccomp-util.h" +#include "securebits-util.h" +#include "selinux-util.h" +#include "serialize.h" +#include "sort-util.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "unit-serialize.h" +#include "user-util.h" +#include "utmp-wtmp.h" + +static bool is_terminal_input(ExecInput i) { + return IN_SET(i, + EXEC_INPUT_TTY, + EXEC_INPUT_TTY_FORCE, + EXEC_INPUT_TTY_FAIL); +} + +static bool is_terminal_output(ExecOutput o) { + return IN_SET(o, + EXEC_OUTPUT_TTY, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE); +} + +const char *exec_context_tty_path(const ExecContext *context) { + assert(context); + + if (context->stdio_as_fds) + return NULL; + + if (context->tty_path) + return context->tty_path; + + return "/dev/console"; +} + +static void exec_context_determine_tty_size( + const ExecContext *context, + const char *tty_path, + unsigned *ret_rows, + unsigned *ret_cols) { + + unsigned rows, cols; + + assert(context); + assert(ret_rows); + assert(ret_cols); + + if (!tty_path) + tty_path = exec_context_tty_path(context); + + rows = context->tty_rows; + cols = context->tty_cols; + + if (tty_path && (rows == UINT_MAX || cols == UINT_MAX)) + (void) proc_cmdline_tty_size( + tty_path, + rows == UINT_MAX ? &rows : NULL, + cols == UINT_MAX ? &cols : NULL); + + *ret_rows = rows; + *ret_cols = cols; +} + +int exec_context_apply_tty_size( + const ExecContext *context, + int tty_fd, + const char *tty_path) { + + unsigned rows, cols; + + exec_context_determine_tty_size(context, tty_path, &rows, &cols); + + return terminal_set_size_fd(tty_fd, tty_path, rows, cols); + } + +void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) { + _cleanup_close_ int _fd = -EBADF, lock_fd = -EBADF; + int fd; + + assert(context); + + const char *path = exec_context_tty_path(context); + + if (p && p->stdin_fd >= 0 && isatty(p->stdin_fd)) + fd = p->stdin_fd; + else if (path && (context->tty_path || is_terminal_input(context->std_input) || + is_terminal_output(context->std_output) || is_terminal_output(context->std_error))) { + fd = _fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return (void) log_debug_errno(fd, "Failed to open terminal '%s', ignoring: %m", path); + } else + return; /* nothing to do */ + + /* Take a synchronization lock for the duration of the setup that we do here. + * systemd-vconsole-setup.service also takes the lock to avoid being interrupted. We open a new fd + * that will be closed automatically, and operate on it for convenience. */ + lock_fd = lock_dev_console(); + if (ERRNO_IS_NEG_PRIVILEGE(lock_fd)) + log_debug_errno(lock_fd, "No privileges to lock /dev/console, proceeding without: %m"); + else if (lock_fd < 0) + return (void) log_debug_errno(lock_fd, "Failed to lock /dev/console: %m"); + + if (context->tty_vhangup) + (void) terminal_vhangup_fd(fd); + + if (context->tty_reset) + (void) reset_terminal_fd(fd, /* switch_to_text= */ true); + + (void) exec_context_apply_tty_size(context, fd, path); + + if (context->tty_vt_disallocate && path) + (void) vt_disallocate(path); +} + +bool exec_needs_network_namespace(const ExecContext *context) { + assert(context); + + return context->private_network || context->network_namespace_path; +} + +static bool exec_needs_ephemeral(const ExecContext *context) { + return (context->root_image || context->root_directory) && context->root_ephemeral; +} + +bool exec_needs_ipc_namespace(const ExecContext *context) { + assert(context); + + return context->private_ipc || context->ipc_namespace_path; +} + +bool exec_needs_mount_namespace( + const ExecContext *context, + const ExecParameters *params, + const ExecRuntime *runtime) { + + assert(context); + + if (context->root_image) + return true; + + if (!strv_isempty(context->read_write_paths) || + !strv_isempty(context->read_only_paths) || + !strv_isempty(context->inaccessible_paths) || + !strv_isempty(context->exec_paths) || + !strv_isempty(context->no_exec_paths)) + return true; + + if (context->n_bind_mounts > 0) + return true; + + if (context->n_temporary_filesystems > 0) + return true; + + if (context->n_mount_images > 0) + return true; + + if (context->n_extension_images > 0) + return true; + + if (!strv_isempty(context->extension_directories)) + return true; + + if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED)) + return true; + + if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir)) + return true; + + if (context->private_devices || + context->private_mounts > 0 || + (context->private_mounts < 0 && exec_needs_network_namespace(context)) || + context->protect_system != PROTECT_SYSTEM_NO || + context->protect_home != PROTECT_HOME_NO || + context->protect_kernel_tunables || + context->protect_kernel_modules || + context->protect_kernel_logs || + context->protect_control_groups || + context->protect_proc != PROTECT_PROC_DEFAULT || + context->proc_subset != PROC_SUBSET_ALL || + exec_needs_ipc_namespace(context)) + return true; + + if (context->root_directory) { + if (exec_context_get_effective_mount_apivfs(context)) + return true; + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (params && !params->prefix[t]) + continue; + + if (context->directories[t].n_items > 0) + return true; + } + } + + if (context->dynamic_user && + (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 || + context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 || + context->directories[EXEC_DIRECTORY_LOGS].n_items > 0)) + return true; + + if (context->log_namespace) + return true; + + return false; +} + +bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) { + assert(context); + + if (!context->dynamic_user) + return false; + + if (type == EXEC_DIRECTORY_CONFIGURATION) + return false; + + if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO) + return false; + + return true; +} + +int exec_params_get_cgroup_path( + const ExecParameters *params, + const CGroupContext *c, + char **ret) { + + const char *subgroup = NULL; + char *p; + + assert(params); + assert(ret); + + if (!params->cgroup_path) + return -EINVAL; + + /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated + * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control + * processes started after the main unit's process in the unit's main cgroup because it is now an inner one, + * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process, + * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=, + * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre= + * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP + * flag, which is only passed for the former statements, not for the latter. */ + + if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) { + if (FLAGS_SET(params->flags, EXEC_IS_CONTROL)) + subgroup = ".control"; + else + subgroup = c->delegate_subgroup; + } + + if (subgroup) + p = path_join(params->cgroup_path, subgroup); + else + p = strdup(params->cgroup_path); + if (!p) + return -ENOMEM; + + *ret = p; + return !!subgroup; +} + +bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) { + assert(c); + + return c->cpu_affinity_from_numa; +} + +static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) { + assert(unit); + assert(msg); + assert(executable); + + if (!DEBUG_LOGGING) + return; + + _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY); + + log_unit_struct(unit, LOG_DEBUG, + "EXECUTABLE=%s", executable, + LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)), + LOG_UNIT_INVOCATION_ID(unit)); +} + +static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); + +int exec_spawn(Unit *unit, + ExecCommand *command, + const ExecContext *context, + ExecParameters *params, + ExecRuntime *runtime, + const CGroupContext *cgroup_context, + pid_t *ret) { + + char serialization_fd_number[DECIMAL_STR_MAX(int) + 1]; + _cleanup_free_ char *subcgroup_path = NULL, *log_level = NULL, *executor_path = NULL; + _cleanup_fdset_free_ FDSet *fdset = NULL; + _cleanup_fclose_ FILE *f = NULL; + pid_t pid; + int r; + + assert(unit); + assert(unit->manager); + assert(unit->manager->executor_fd >= 0); + assert(command); + assert(context); + assert(ret); + assert(params); + assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0)); + assert(!params->files_env); /* We fill this field, ensure it comes NULL-initialized to us */ + + LOG_CONTEXT_PUSH_UNIT(unit); + + r = exec_context_load_environment(unit, context, ¶ms->files_env); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to load environment files: %m"); + + /* We won't know the real executable path until we create the mount namespace in the child, but we + want to log from the parent, so we use the possibly inaccurate path here. */ + log_command_line(unit, "About to execute", command->path, command->argv); + + if (params->cgroup_path) { + r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m"); + if (r > 0) { + /* If there's a subcgroup, then let's create it here now (the main cgroup was already + * realized by the unit logic) */ + + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path); + } + } + + /* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the + * child's memory.max, serialize all the state needed to start the unit, and pass it to the + * systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec + * and ensure all memory is shared. The child immediately execs the new binary so the delay should + * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the + * target cgroup. */ + + r = open_serialization_file("sd-executor-state", &f); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to open serialization stream: %m"); + + fdset = fdset_new(); + if (!fdset) + return log_oom(); + + r = exec_serialize_invocation(f, fdset, context, command, params, runtime, cgroup_context); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to serialize parameters: %m"); + + if (fseeko(f, 0, SEEK_SET) < 0) + return log_unit_error_errno(unit, errno, "Failed to reseek on serialization stream: %m"); + + r = fd_cloexec(fileno(f), false); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialization fd: %m"); + + r = fdset_cloexec(fdset, false); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialized fds: %m"); + + r = log_level_to_string_alloc(log_get_max_level(), &log_level); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to convert log level to string: %m"); + + r = fd_get_path(unit->manager->executor_fd, &executor_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to get executor path from fd: %m"); + + xsprintf(serialization_fd_number, "%i", fileno(f)); + + /* The executor binary is pinned, to avoid compatibility problems during upgrades. */ + r = posix_spawn_wrapper( + FORMAT_PROC_FD_PATH(unit->manager->executor_fd), + STRV_MAKE(executor_path, + "--deserialize", serialization_fd_number, + "--log-level", log_level, + "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))), + environ, + &pid); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to spawn executor: %m"); + + log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid); + + /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever + * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the + * process will be killed too). */ + if (subcgroup_path) + (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid); + + exec_status_start(&command->exec_status, pid); + + *ret = pid; + return 0; +} + +void exec_context_init(ExecContext *c) { + assert(c); + + /* When initializing a bool member to 'true', make sure to serialize in execute-serialize.c using + * serialize_bool() instead of serialize_bool_elide(). */ + + *c = (ExecContext) { + .umask = 0022, + .ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO, + .cpu_sched_policy = SCHED_OTHER, + .syslog_priority = LOG_DAEMON|LOG_INFO, + .syslog_level_prefix = true, + .ignore_sigpipe = true, + .timer_slack_nsec = NSEC_INFINITY, + .personality = PERSONALITY_INVALID, + .timeout_clean_usec = USEC_INFINITY, + .capability_bounding_set = CAP_MASK_UNSET, + .restrict_namespaces = NAMESPACE_FLAGS_INITIAL, + .log_level_max = -1, +#if HAVE_SECCOMP + .syscall_errno = SECCOMP_ERROR_NUMBER_KILL, +#endif + .tty_rows = UINT_MAX, + .tty_cols = UINT_MAX, + .private_mounts = -1, + .memory_ksm = -1, + .set_login_environment = -1, + }; + + FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX) + d->mode = 0755; + + numa_policy_reset(&c->numa_policy); + + assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL); +} + +void exec_context_done(ExecContext *c) { + assert(c); + + c->environment = strv_free(c->environment); + c->environment_files = strv_free(c->environment_files); + c->pass_environment = strv_free(c->pass_environment); + c->unset_environment = strv_free(c->unset_environment); + + rlimit_free_all(c->rlimit); + + for (size_t l = 0; l < 3; l++) { + c->stdio_fdname[l] = mfree(c->stdio_fdname[l]); + c->stdio_file[l] = mfree(c->stdio_file[l]); + } + + c->working_directory = mfree(c->working_directory); + c->root_directory = mfree(c->root_directory); + c->root_image = mfree(c->root_image); + c->root_image_options = mount_options_free_all(c->root_image_options); + c->root_hash = mfree(c->root_hash); + c->root_hash_size = 0; + c->root_hash_path = mfree(c->root_hash_path); + c->root_hash_sig = mfree(c->root_hash_sig); + c->root_hash_sig_size = 0; + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + c->root_verity = mfree(c->root_verity); + c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images); + c->extension_directories = strv_free(c->extension_directories); + c->tty_path = mfree(c->tty_path); + c->syslog_identifier = mfree(c->syslog_identifier); + c->user = mfree(c->user); + c->group = mfree(c->group); + + c->supplementary_groups = strv_free(c->supplementary_groups); + + c->pam_name = mfree(c->pam_name); + + c->read_only_paths = strv_free(c->read_only_paths); + c->read_write_paths = strv_free(c->read_write_paths); + c->inaccessible_paths = strv_free(c->inaccessible_paths); + c->exec_paths = strv_free(c->exec_paths); + c->no_exec_paths = strv_free(c->no_exec_paths); + c->exec_search_path = strv_free(c->exec_search_path); + + bind_mount_free_many(c->bind_mounts, c->n_bind_mounts); + c->bind_mounts = NULL; + c->n_bind_mounts = 0; + temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems); + c->temporary_filesystems = NULL; + c->n_temporary_filesystems = 0; + c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images); + + cpu_set_reset(&c->cpu_set); + numa_policy_reset(&c->numa_policy); + + c->utmp_id = mfree(c->utmp_id); + c->selinux_context = mfree(c->selinux_context); + c->apparmor_profile = mfree(c->apparmor_profile); + c->smack_process_label = mfree(c->smack_process_label); + + c->restrict_filesystems = set_free_free(c->restrict_filesystems); + + c->syscall_filter = hashmap_free(c->syscall_filter); + c->syscall_archs = set_free(c->syscall_archs); + c->address_families = set_free(c->address_families); + + FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX) + exec_directory_done(d); + + c->log_level_max = -1; + + exec_context_free_log_extra_fields(c); + c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns); + c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns); + + c->log_ratelimit_interval_usec = 0; + c->log_ratelimit_burst = 0; + + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + + c->network_namespace_path = mfree(c->network_namespace_path); + c->ipc_namespace_path = mfree(c->ipc_namespace_path); + + c->log_namespace = mfree(c->log_namespace); + + c->load_credentials = hashmap_free(c->load_credentials); + c->set_credentials = hashmap_free(c->set_credentials); + c->import_credentials = set_free_free(c->import_credentials); + + c->root_image_policy = image_policy_free(c->root_image_policy); + c->mount_image_policy = image_policy_free(c->mount_image_policy); + c->extension_image_policy = image_policy_free(c->extension_image_policy); +} + +int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) { + assert(c); + + if (!runtime_prefix) + return 0; + + FOREACH_ARRAY(i, c->directories[EXEC_DIRECTORY_RUNTIME].items, c->directories[EXEC_DIRECTORY_RUNTIME].n_items) { + _cleanup_free_ char *p = NULL; + + if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME)) + p = path_join(runtime_prefix, "private", i->path); + else + p = path_join(runtime_prefix, i->path); + if (!p) + return -ENOMEM; + + /* We execute this synchronously, since we need to be sure this is gone when we start the + * service next. */ + (void) rm_rf(p, REMOVE_ROOT); + + STRV_FOREACH(symlink, i->symlinks) { + _cleanup_free_ char *symlink_abs = NULL; + + if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME)) + symlink_abs = path_join(runtime_prefix, "private", *symlink); + else + symlink_abs = path_join(runtime_prefix, *symlink); + if (!symlink_abs) + return -ENOMEM; + + (void) unlink(symlink_abs); + } + } + + return 0; +} + +int exec_context_destroy_mount_ns_dir(Unit *u) { + _cleanup_free_ char *p = NULL; + + if (!u || !MANAGER_IS_SYSTEM(u->manager)) + return 0; + + p = path_join("/run/systemd/propagate/", u->id); + if (!p) + return -ENOMEM; + + /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/ + if (rmdir(p) < 0 && errno != ENOENT) + log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p); + + return 0; +} + +void exec_command_done(ExecCommand *c) { + assert(c); + + c->path = mfree(c->path); + c->argv = strv_free(c->argv); +} + +void exec_command_done_array(ExecCommand *c, size_t n) { + FOREACH_ARRAY(i, c, n) + exec_command_done(i); +} + +ExecCommand* exec_command_free_list(ExecCommand *c) { + ExecCommand *i; + + while ((i = LIST_POP(command, c))) { + exec_command_done(i); + free(i); + } + + return NULL; +} + +void exec_command_free_array(ExecCommand **c, size_t n) { + FOREACH_ARRAY(i, c, n) + *i = exec_command_free_list(*i); +} + +void exec_command_reset_status_array(ExecCommand *c, size_t n) { + FOREACH_ARRAY(i, c, n) + exec_status_reset(&i->exec_status); +} + +void exec_command_reset_status_list_array(ExecCommand **c, size_t n) { + FOREACH_ARRAY(i, c, n) + LIST_FOREACH(command, z, *i) + exec_status_reset(&z->exec_status); +} + +typedef struct InvalidEnvInfo { + const Unit *unit; + const char *path; +} InvalidEnvInfo; + +static void invalid_env(const char *p, void *userdata) { + InvalidEnvInfo *info = userdata; + + log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path); +} + +const char* exec_context_fdname(const ExecContext *c, int fd_index) { + assert(c); + + switch (fd_index) { + + case STDIN_FILENO: + if (c->std_input != EXEC_INPUT_NAMED_FD) + return NULL; + + return c->stdio_fdname[STDIN_FILENO] ?: "stdin"; + + case STDOUT_FILENO: + if (c->std_output != EXEC_OUTPUT_NAMED_FD) + return NULL; + + return c->stdio_fdname[STDOUT_FILENO] ?: "stdout"; + + case STDERR_FILENO: + if (c->std_error != EXEC_OUTPUT_NAMED_FD) + return NULL; + + return c->stdio_fdname[STDERR_FILENO] ?: "stderr"; + + default: + return NULL; + } +} + +static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) { + _cleanup_strv_free_ char **v = NULL; + int r; + + assert(c); + assert(ret); + + STRV_FOREACH(i, c->environment_files) { + _cleanup_globfree_ glob_t pglob = {}; + bool ignore = false; + char *fn = *i; + + if (fn[0] == '-') { + ignore = true; + fn++; + } + + if (!path_is_absolute(fn)) { + if (ignore) + continue; + return -EINVAL; + } + + /* Filename supports globbing, take all matching files */ + r = safe_glob(fn, 0, &pglob); + if (r < 0) { + if (ignore) + continue; + return r; + } + + /* When we don't match anything, -ENOENT should be returned */ + assert(pglob.gl_pathc > 0); + + FOREACH_ARRAY(path, pglob.gl_pathv, pglob.gl_pathc) { + _cleanup_strv_free_ char **p = NULL; + + r = load_env_file(NULL, *path, &p); + if (r < 0) { + if (ignore) + continue; + return r; + } + + /* Log invalid environment variables with filename */ + if (p) { + InvalidEnvInfo info = { + .unit = unit, + .path = *path, + }; + + p = strv_env_clean_with_callback(p, invalid_env, &info); + } + + if (!v) + v = TAKE_PTR(p); + else { + char **m = strv_env_merge(v, p); + if (!m) + return -ENOMEM; + + strv_free_and_replace(v, m); + } + } + } + + *ret = TAKE_PTR(v); + + return 0; +} + +static bool tty_may_match_dev_console(const char *tty) { + _cleanup_free_ char *resolved = NULL; + + if (!tty) + return true; + + tty = skip_dev_prefix(tty); + + /* trivial identity? */ + if (streq(tty, "console")) + return true; + + if (resolve_dev_console(&resolved) < 0) + return true; /* if we could not resolve, assume it may */ + + /* "tty0" means the active VC, so it may be the same sometimes */ + return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty)); +} + +static bool exec_context_may_touch_tty(const ExecContext *ec) { + assert(ec); + + return ec->tty_reset || + ec->tty_vhangup || + ec->tty_vt_disallocate || + is_terminal_input(ec->std_input) || + is_terminal_output(ec->std_output) || + is_terminal_output(ec->std_error); +} + +bool exec_context_may_touch_console(const ExecContext *ec) { + + return exec_context_may_touch_tty(ec) && + tty_may_match_dev_console(exec_context_tty_path(ec)); +} + +static void strv_fprintf(FILE *f, char **l) { + assert(f); + + STRV_FOREACH(g, l) + fprintf(f, " %s", *g); +} + +static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) { + assert(f); + assert(prefix); + assert(name); + + if (!strv_isempty(strv)) { + fprintf(f, "%s%s:", prefix, name); + strv_fprintf(f, strv); + fputs("\n", f); + } +} + +void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix) { + assert(p); + assert(f); + + prefix = strempty(prefix); + + fprintf(f, + "%sRuntimeScope: %s\n" + "%sExecFlags: %u\n" + "%sSELinuxContextNetwork: %s\n" + "%sCgroupSupportedMask: %u\n" + "%sCgroupPath: %s\n" + "%sCrededentialsDirectory: %s\n" + "%sEncryptedCredentialsDirectory: %s\n" + "%sConfirmSpawn: %s\n" + "%sShallConfirmSpawn: %s\n" + "%sWatchdogUSec: " USEC_FMT "\n" + "%sNotifySocket: %s\n" + "%sFallbackSmackProcessLabel: %s\n", + prefix, runtime_scope_to_string(p->runtime_scope), + prefix, p->flags, + prefix, yes_no(p->selinux_context_net), + prefix, p->cgroup_supported, + prefix, p->cgroup_path, + prefix, strempty(p->received_credentials_directory), + prefix, strempty(p->received_encrypted_credentials_directory), + prefix, strempty(p->confirm_spawn), + prefix, yes_no(p->shall_confirm_spawn), + prefix, p->watchdog_usec, + prefix, strempty(p->notify_socket), + prefix, strempty(p->fallback_smack_process_label)); + + strv_dump(f, prefix, "FdNames", p->fd_names); + strv_dump(f, prefix, "Environment", p->environment); + strv_dump(f, prefix, "Prefix", p->prefix); + + LIST_FOREACH(open_files, file, p->open_files) + fprintf(f, "%sOpenFile: %s %s", prefix, file->path, open_file_flags_to_string(file->flags)); + + strv_dump(f, prefix, "FilesEnv", p->files_env); +} + +void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { + int r; + + assert(c); + assert(f); + + prefix = strempty(prefix); + + fprintf(f, + "%sUMask: %04o\n" + "%sWorkingDirectory: %s\n" + "%sRootDirectory: %s\n" + "%sRootEphemeral: %s\n" + "%sNonBlocking: %s\n" + "%sPrivateTmp: %s\n" + "%sPrivateDevices: %s\n" + "%sProtectKernelTunables: %s\n" + "%sProtectKernelModules: %s\n" + "%sProtectKernelLogs: %s\n" + "%sProtectClock: %s\n" + "%sProtectControlGroups: %s\n" + "%sPrivateNetwork: %s\n" + "%sPrivateUsers: %s\n" + "%sProtectHome: %s\n" + "%sProtectSystem: %s\n" + "%sMountAPIVFS: %s\n" + "%sIgnoreSIGPIPE: %s\n" + "%sMemoryDenyWriteExecute: %s\n" + "%sRestrictRealtime: %s\n" + "%sRestrictSUIDSGID: %s\n" + "%sKeyringMode: %s\n" + "%sProtectHostname: %s\n" + "%sProtectProc: %s\n" + "%sProcSubset: %s\n", + prefix, c->umask, + prefix, empty_to_root(c->working_directory), + prefix, empty_to_root(c->root_directory), + prefix, yes_no(c->root_ephemeral), + prefix, yes_no(c->non_blocking), + prefix, yes_no(c->private_tmp), + prefix, yes_no(c->private_devices), + prefix, yes_no(c->protect_kernel_tunables), + prefix, yes_no(c->protect_kernel_modules), + prefix, yes_no(c->protect_kernel_logs), + prefix, yes_no(c->protect_clock), + prefix, yes_no(c->protect_control_groups), + prefix, yes_no(c->private_network), + prefix, yes_no(c->private_users), + prefix, protect_home_to_string(c->protect_home), + prefix, protect_system_to_string(c->protect_system), + prefix, yes_no(exec_context_get_effective_mount_apivfs(c)), + prefix, yes_no(c->ignore_sigpipe), + prefix, yes_no(c->memory_deny_write_execute), + prefix, yes_no(c->restrict_realtime), + prefix, yes_no(c->restrict_suid_sgid), + prefix, exec_keyring_mode_to_string(c->keyring_mode), + prefix, yes_no(c->protect_hostname), + prefix, protect_proc_to_string(c->protect_proc), + prefix, proc_subset_to_string(c->proc_subset)); + + if (c->set_login_environment >= 0) + fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0)); + + if (c->root_image) + fprintf(f, "%sRootImage: %s\n", prefix, c->root_image); + + if (c->root_image_options) { + fprintf(f, "%sRootImageOptions:", prefix); + LIST_FOREACH(mount_options, o, c->root_image_options) + if (!isempty(o->options)) + fprintf(f, " %s:%s", + partition_designator_to_string(o->partition_designator), + o->options); + fprintf(f, "\n"); + } + + if (c->root_hash) { + _cleanup_free_ char *encoded = NULL; + encoded = hexmem(c->root_hash, c->root_hash_size); + if (encoded) + fprintf(f, "%sRootHash: %s\n", prefix, encoded); + } + + if (c->root_hash_path) + fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path); + + if (c->root_hash_sig) { + _cleanup_free_ char *encoded = NULL; + ssize_t len; + len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded); + if (len) + fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded); + } + + if (c->root_hash_sig_path) + fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path); + + if (c->root_verity) + fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity); + + STRV_FOREACH(e, c->environment) + fprintf(f, "%sEnvironment: %s\n", prefix, *e); + + STRV_FOREACH(e, c->environment_files) + fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e); + + STRV_FOREACH(e, c->pass_environment) + fprintf(f, "%sPassEnvironment: %s\n", prefix, *e); + + STRV_FOREACH(e, c->unset_environment) + fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e); + + fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode)); + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode); + + for (size_t i = 0; i < c->directories[dt].n_items; i++) { + fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path); + + STRV_FOREACH(d, c->directories[dt].items[i].symlinks) + fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d); + } + } + + fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC)); + + if (c->memory_ksm >= 0) + fprintf(f, "%sMemoryKSM: %s\n", prefix, yes_no(c->memory_ksm > 0)); + + if (c->nice_set) + fprintf(f, "%sNice: %i\n", prefix, c->nice); + + if (c->oom_score_adjust_set) + fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust); + + if (c->coredump_filter_set) + fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter); + + for (unsigned i = 0; i < RLIM_NLIMITS; i++) + if (c->rlimit[i]) { + fprintf(f, "%sLimit%s: " RLIM_FMT "\n", + prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max); + fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n", + prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur); + } + + if (c->ioprio_set) { + _cleanup_free_ char *class_str = NULL; + + r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str); + if (r >= 0) + fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str); + + fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio)); + } + + if (c->cpu_sched_set) { + _cleanup_free_ char *policy_str = NULL; + + r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str); + if (r >= 0) + fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str); + + fprintf(f, + "%sCPUSchedulingPriority: %i\n" + "%sCPUSchedulingResetOnFork: %s\n", + prefix, c->cpu_sched_priority, + prefix, yes_no(c->cpu_sched_reset_on_fork)); + } + + if (c->cpu_set.set) { + _cleanup_free_ char *affinity = NULL; + + affinity = cpu_set_to_range_string(&c->cpu_set); + fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity); + } + + if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) { + _cleanup_free_ char *nodes = NULL; + + nodes = cpu_set_to_range_string(&c->numa_policy.nodes); + fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy))); + fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes)); + } + + if (c->timer_slack_nsec != NSEC_INFINITY) + fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec); + + fprintf(f, + "%sStandardInput: %s\n" + "%sStandardOutput: %s\n" + "%sStandardError: %s\n", + prefix, exec_input_to_string(c->std_input), + prefix, exec_output_to_string(c->std_output), + prefix, exec_output_to_string(c->std_error)); + + if (c->std_input == EXEC_INPUT_NAMED_FD) + fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]); + if (c->std_output == EXEC_OUTPUT_NAMED_FD) + fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]); + if (c->std_error == EXEC_OUTPUT_NAMED_FD) + fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]); + + if (c->std_input == EXEC_INPUT_FILE) + fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]); + if (c->std_output == EXEC_OUTPUT_FILE) + fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]); + if (c->std_output == EXEC_OUTPUT_FILE_APPEND) + fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]); + if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE) + fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]); + if (c->std_error == EXEC_OUTPUT_FILE) + fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]); + if (c->std_error == EXEC_OUTPUT_FILE_APPEND) + fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]); + if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE) + fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]); + + if (c->tty_path) + fprintf(f, + "%sTTYPath: %s\n" + "%sTTYReset: %s\n" + "%sTTYVHangup: %s\n" + "%sTTYVTDisallocate: %s\n" + "%sTTYRows: %u\n" + "%sTTYColumns: %u\n", + prefix, c->tty_path, + prefix, yes_no(c->tty_reset), + prefix, yes_no(c->tty_vhangup), + prefix, yes_no(c->tty_vt_disallocate), + prefix, c->tty_rows, + prefix, c->tty_cols); + + if (IN_SET(c->std_output, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_JOURNAL, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE) || + IN_SET(c->std_error, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_JOURNAL, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) { + + _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL; + + r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str); + if (r >= 0) + fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str); + + r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str); + if (r >= 0) + fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str); + } + + if (c->log_level_max >= 0) { + _cleanup_free_ char *t = NULL; + + (void) log_level_to_string_alloc(c->log_level_max, &t); + + fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t)); + } + + if (c->log_ratelimit_interval_usec > 0) + fprintf(f, + "%sLogRateLimitIntervalSec: %s\n", + prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC)); + + if (c->log_ratelimit_burst > 0) + fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst); + + if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) { + fprintf(f, "%sLogFilterPatterns:", prefix); + + char *pattern; + SET_FOREACH(pattern, c->log_filter_allowed_patterns) + fprintf(f, " %s", pattern); + SET_FOREACH(pattern, c->log_filter_denied_patterns) + fprintf(f, " ~%s", pattern); + fputc('\n', f); + } + + FOREACH_ARRAY(field, c->log_extra_fields, c->n_log_extra_fields) { + fprintf(f, "%sLogExtraFields: ", prefix); + fwrite(field->iov_base, 1, field->iov_len, f); + fputc('\n', f); + } + + if (c->log_namespace) + fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace); + + if (c->secure_bits) { + _cleanup_free_ char *str = NULL; + + r = secure_bits_to_string_alloc(c->secure_bits, &str); + if (r >= 0) + fprintf(f, "%sSecure Bits: %s\n", prefix, str); + } + + if (c->capability_bounding_set != CAP_MASK_UNSET) { + _cleanup_free_ char *str = NULL; + + r = capability_set_to_string(c->capability_bounding_set, &str); + if (r >= 0) + fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str); + } + + if (c->capability_ambient_set != 0) { + _cleanup_free_ char *str = NULL; + + r = capability_set_to_string(c->capability_ambient_set, &str); + if (r >= 0) + fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str); + } + + if (c->user) + fprintf(f, "%sUser: %s\n", prefix, c->user); + if (c->group) + fprintf(f, "%sGroup: %s\n", prefix, c->group); + + fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user)); + + strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups); + + if (c->pam_name) + fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name); + + strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths); + strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths); + strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths); + strv_dump(f, prefix, "ExecPaths", c->exec_paths); + strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths); + strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path); + + FOREACH_ARRAY(mount, c->bind_mounts, c->n_bind_mounts) + fprintf(f, "%s%s: %s%s:%s:%s\n", prefix, + mount->read_only ? "BindReadOnlyPaths" : "BindPaths", + mount->ignore_enoent ? "-": "", + mount->source, + mount->destination, + mount->recursive ? "rbind" : "norbind"); + + FOREACH_ARRAY(tmpfs, c->temporary_filesystems, c->n_temporary_filesystems) + fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix, + tmpfs->path, + isempty(tmpfs->options) ? "" : ":", + strempty(tmpfs->options)); + + if (c->utmp_id) + fprintf(f, + "%sUtmpIdentifier: %s\n", + prefix, c->utmp_id); + + if (c->selinux_context) + fprintf(f, + "%sSELinuxContext: %s%s\n", + prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context); + + if (c->apparmor_profile) + fprintf(f, + "%sAppArmorProfile: %s%s\n", + prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile); + + if (c->smack_process_label) + fprintf(f, + "%sSmackProcessLabel: %s%s\n", + prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label); + + if (c->personality != PERSONALITY_INVALID) + fprintf(f, + "%sPersonality: %s\n", + prefix, strna(personality_to_string(c->personality))); + + fprintf(f, + "%sLockPersonality: %s\n", + prefix, yes_no(c->lock_personality)); + + if (c->syscall_filter) { + fprintf(f, + "%sSystemCallFilter: ", + prefix); + + if (!c->syscall_allow_list) + fputc('~', f); + +#if HAVE_SECCOMP + void *id, *val; + bool first = true; + HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) { + _cleanup_free_ char *name = NULL; + const char *errno_name = NULL; + int num = PTR_TO_INT(val); + + if (first) + first = false; + else + fputc(' ', f); + + name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); + fputs(strna(name), f); + + if (num >= 0) { + errno_name = seccomp_errno_or_action_to_string(num); + if (errno_name) + fprintf(f, ":%s", errno_name); + else + fprintf(f, ":%d", num); + } + } +#endif + + fputc('\n', f); + } + + if (c->syscall_archs) { + fprintf(f, + "%sSystemCallArchitectures:", + prefix); + +#if HAVE_SECCOMP + void *id; + SET_FOREACH(id, c->syscall_archs) + fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1))); +#endif + fputc('\n', f); + } + + if (exec_context_restrict_namespaces_set(c)) { + _cleanup_free_ char *s = NULL; + + r = namespace_flags_to_string(c->restrict_namespaces, &s); + if (r >= 0) + fprintf(f, "%sRestrictNamespaces: %s\n", + prefix, strna(s)); + } + +#if HAVE_LIBBPF + if (exec_context_restrict_filesystems_set(c)) { + char *fs; + SET_FOREACH(fs, c->restrict_filesystems) + fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs); + } +#endif + + if (c->network_namespace_path) + fprintf(f, + "%sNetworkNamespacePath: %s\n", + prefix, c->network_namespace_path); + + if (c->syscall_errno > 0) { + fprintf(f, "%sSystemCallErrorNumber: ", prefix); + +#if HAVE_SECCOMP + const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno); + if (errno_name) + fputs(errno_name, f); + else + fprintf(f, "%d", c->syscall_errno); +#endif + fputc('\n', f); + } + + FOREACH_ARRAY(mount, c->mount_images, c->n_mount_images) { + fprintf(f, "%sMountImages: %s%s:%s", prefix, + mount->ignore_enoent ? "-": "", + mount->source, + mount->destination); + LIST_FOREACH(mount_options, o, mount->mount_options) + fprintf(f, ":%s:%s", + partition_designator_to_string(o->partition_designator), + strempty(o->options)); + fprintf(f, "\n"); + } + + FOREACH_ARRAY(mount, c->extension_images, c->n_extension_images) { + fprintf(f, "%sExtensionImages: %s%s", prefix, + mount->ignore_enoent ? "-": "", + mount->source); + LIST_FOREACH(mount_options, o, mount->mount_options) + fprintf(f, ":%s:%s", + partition_designator_to_string(o->partition_designator), + strempty(o->options)); + fprintf(f, "\n"); + } + + strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories); +} + +bool exec_context_maintains_privileges(const ExecContext *c) { + assert(c); + + /* Returns true if the process forked off would run under + * an unchanged UID or as root. */ + + if (!c->user) + return true; + + if (streq(c->user, "root") || streq(c->user, "0")) + return true; + + return false; +} + +int exec_context_get_effective_ioprio(const ExecContext *c) { + int p; + + assert(c); + + if (c->ioprio_set) + return c->ioprio; + + p = ioprio_get(IOPRIO_WHO_PROCESS, 0); + if (p < 0) + return IOPRIO_DEFAULT_CLASS_AND_PRIO; + + return ioprio_normalize(p); +} + +bool exec_context_get_effective_mount_apivfs(const ExecContext *c) { + assert(c); + + /* Explicit setting wins */ + if (c->mount_apivfs_set) + return c->mount_apivfs; + + /* Default to "yes" if root directory or image are specified */ + if (exec_context_with_rootfs(c)) + return true; + + return false; +} + +void exec_context_free_log_extra_fields(ExecContext *c) { + assert(c); + + FOREACH_ARRAY(field, c->log_extra_fields, c->n_log_extra_fields) + free(field->iov_base); + + c->log_extra_fields = mfree(c->log_extra_fields); + c->n_log_extra_fields = 0; +} + +void exec_context_revert_tty(ExecContext *c) { + _cleanup_close_ int fd = -EBADF; + const char *path; + struct stat st; + int r; + + assert(c); + + /* First, reset the TTY (possibly kicking everybody else from the TTY) */ + exec_context_tty_reset(c, /* parameters= */ NULL); + + /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path + * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed + * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */ + if (!exec_context_may_touch_tty(c)) + return; + + path = exec_context_tty_path(c); + if (!path) + return; + + fd = open(path, O_PATH|O_CLOEXEC); /* Pin the inode */ + if (fd < 0) + return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m", + path); + + if (fstat(fd, &st) < 0) + return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path); + + /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check + * if things are a character device, since a proper check either means we'd have to open the TTY and + * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects + * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother + * with this at all? → https://github.com/systemd/systemd/issues/19213 */ + if (!S_ISCHR(st.st_mode)) + return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path); + + r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID); + if (r < 0) + log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s to " UID_FMT ":" GID_FMT ", ignoring: %m", path, (uid_t) 0, (gid_t) TTY_GID); +} + +int exec_context_get_clean_directories( + ExecContext *c, + char **prefix, + ExecCleanMask mask, + char ***ret) { + + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(c); + assert(prefix); + assert(ret); + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!FLAGS_SET(mask, 1U << t)) + continue; + + if (!prefix[t]) + continue; + + FOREACH_ARRAY(i, c->directories[t].items, c->directories[t].n_items) { + char *j; + + j = path_join(prefix[t], i->path); + if (!j) + return -ENOMEM; + + r = strv_consume(&l, j); + if (r < 0) + return r; + + /* Also remove private directories unconditionally. */ + if (t != EXEC_DIRECTORY_CONFIGURATION) { + j = path_join(prefix[t], "private", i->path); + if (!j) + return -ENOMEM; + + r = strv_consume(&l, j); + if (r < 0) + return r; + } + + STRV_FOREACH(symlink, i->symlinks) { + j = path_join(prefix[t], *symlink); + if (!j) + return -ENOMEM; + + r = strv_consume(&l, j); + if (r < 0) + return r; + } + } + } + + *ret = TAKE_PTR(l); + return 0; +} + +int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) { + ExecCleanMask mask = 0; + + assert(c); + assert(ret); + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) + if (c->directories[t].n_items > 0) + mask |= 1U << t; + + *ret = mask; + return 0; +} + +int exec_context_get_oom_score_adjust(const ExecContext *c) { + int n = 0, r; + + assert(c); + + if (c->oom_score_adjust_set) + return c->oom_score_adjust; + + r = get_oom_score_adjust(&n); + if (r < 0) + log_debug_errno(r, "Failed to read /proc/self/oom_score_adj, ignoring: %m"); + + return n; +} + +uint64_t exec_context_get_coredump_filter(const ExecContext *c) { + _cleanup_free_ char *t = NULL; + uint64_t n = COREDUMP_FILTER_MASK_DEFAULT; + int r; + + assert(c); + + if (c->coredump_filter_set) + return c->coredump_filter; + + r = read_one_line_file("/proc/self/coredump_filter", &t); + if (r < 0) + log_debug_errno(r, "Failed to read /proc/self/coredump_filter, ignoring: %m"); + else { + r = safe_atoux64(t, &n); + if (r < 0) + log_debug_errno(r, "Failed to parse \"%s\" from /proc/self/coredump_filter, ignoring: %m", t); + } + + return n; +} + +int exec_context_get_nice(const ExecContext *c) { + int n; + + assert(c); + + if (c->nice_set) + return c->nice; + + errno = 0; + n = getpriority(PRIO_PROCESS, 0); + if (errno > 0) { + log_debug_errno(errno, "Failed to get process nice value, ignoring: %m"); + n = 0; + } + + return n; +} + +int exec_context_get_cpu_sched_policy(const ExecContext *c) { + int n; + + assert(c); + + if (c->cpu_sched_set) + return c->cpu_sched_policy; + + n = sched_getscheduler(0); + if (n < 0) + log_debug_errno(errno, "Failed to get scheduler policy, ignoring: %m"); + + return n < 0 ? SCHED_OTHER : n; +} + +int exec_context_get_cpu_sched_priority(const ExecContext *c) { + struct sched_param p = {}; + int r; + + assert(c); + + if (c->cpu_sched_set) + return c->cpu_sched_priority; + + r = sched_getparam(0, &p); + if (r < 0) + log_debug_errno(errno, "Failed to get scheduler priority, ignoring: %m"); + + return r >= 0 ? p.sched_priority : 0; +} + +uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c) { + int r; + + assert(c); + + if (c->timer_slack_nsec != NSEC_INFINITY) + return c->timer_slack_nsec; + + r = prctl(PR_GET_TIMERSLACK); + if (r < 0) + log_debug_errno(r, "Failed to get timer slack, ignoring: %m"); + + return (uint64_t) MAX(r, 0); +} + +char** exec_context_get_syscall_filter(const ExecContext *c) { + _cleanup_strv_free_ char **l = NULL; + + assert(c); + +#if HAVE_SECCOMP + void *id, *val; + HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) { + _cleanup_free_ char *name = NULL; + const char *e = NULL; + char *s; + int num = PTR_TO_INT(val); + + if (c->syscall_allow_list && num >= 0) + /* syscall with num >= 0 in allow-list is denied. */ + continue; + + name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); + if (!name) + continue; + + if (num >= 0) { + e = seccomp_errno_or_action_to_string(num); + if (e) { + s = strjoin(name, ":", e); + if (!s) + return NULL; + } else { + if (asprintf(&s, "%s:%d", name, num) < 0) + return NULL; + } + } else + s = TAKE_PTR(name); + + if (strv_consume(&l, s) < 0) + return NULL; + } + + strv_sort(l); +#endif + + return l ? TAKE_PTR(l) : strv_new(NULL); +} + +char** exec_context_get_syscall_archs(const ExecContext *c) { + _cleanup_strv_free_ char **l = NULL; + + assert(c); + +#if HAVE_SECCOMP + void *id; + SET_FOREACH(id, c->syscall_archs) { + const char *name; + + name = seccomp_arch_to_string(PTR_TO_UINT32(id) - 1); + if (!name) + continue; + + if (strv_extend(&l, name) < 0) + return NULL; + } + + strv_sort(l); +#endif + + return l ? TAKE_PTR(l) : strv_new(NULL); +} + +char** exec_context_get_syscall_log(const ExecContext *c) { + _cleanup_strv_free_ char **l = NULL; + + assert(c); + +#if HAVE_SECCOMP + void *id, *val; + HASHMAP_FOREACH_KEY(val, id, c->syscall_log) { + char *name = NULL; + + name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); + if (!name) + continue; + + if (strv_consume(&l, name) < 0) + return NULL; + } + + strv_sort(l); +#endif + + return l ? TAKE_PTR(l) : strv_new(NULL); +} + +char** exec_context_get_address_families(const ExecContext *c) { + _cleanup_strv_free_ char **l = NULL; + void *af; + + assert(c); + + SET_FOREACH(af, c->address_families) { + const char *name; + + name = af_to_name(PTR_TO_INT(af)); + if (!name) + continue; + + if (strv_extend(&l, name) < 0) + return NULL; + } + + strv_sort(l); + + return l ? TAKE_PTR(l) : strv_new(NULL); +} + +char** exec_context_get_restrict_filesystems(const ExecContext *c) { + _cleanup_strv_free_ char **l = NULL; + + assert(c); + +#if HAVE_LIBBPF + l = set_get_strv(c->restrict_filesystems); + if (!l) + return NULL; + + strv_sort(l); +#endif + + return l ? TAKE_PTR(l) : strv_new(NULL); +} + +void exec_status_start(ExecStatus *s, pid_t pid) { + assert(s); + + *s = (ExecStatus) { + .pid = pid, + }; + + dual_timestamp_now(&s->start_timestamp); +} + +void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) { + assert(s); + + if (s->pid != pid) + *s = (ExecStatus) { + .pid = pid, + }; + + dual_timestamp_now(&s->exit_timestamp); + + s->code = code; + s->status = status; + + if (context && context->utmp_id) + (void) utmp_put_dead_process(context->utmp_id, pid, code, status); +} + +void exec_status_reset(ExecStatus *s) { + assert(s); + + *s = (ExecStatus) {}; +} + +void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) { + assert(s); + assert(f); + + if (s->pid <= 0) + return; + + prefix = strempty(prefix); + + fprintf(f, + "%sPID: "PID_FMT"\n", + prefix, s->pid); + + if (dual_timestamp_is_set(&s->start_timestamp)) + fprintf(f, + "%sStart Timestamp: %s\n", + prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime)); + + if (dual_timestamp_is_set(&s->exit_timestamp)) + fprintf(f, + "%sExit Timestamp: %s\n" + "%sExit Code: %s\n" + "%sExit Status: %i\n", + prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime), + prefix, sigchld_code_to_string(s->code), + prefix, s->status); +} + +static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) { + _cleanup_free_ char *cmd = NULL; + const char *prefix2; + + assert(c); + assert(f); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY); + + fprintf(f, + "%sCommand Line: %s\n", + prefix, strnull(cmd)); + + exec_status_dump(&c->exec_status, f, prefix2); +} + +void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) { + assert(f); + + prefix = strempty(prefix); + + LIST_FOREACH(command, i, c) + exec_command_dump(i, f, prefix); +} + +void exec_command_append_list(ExecCommand **l, ExecCommand *e) { + ExecCommand *end; + + assert(l); + assert(e); + + if (*l) { + /* It's kind of important, that we keep the order here */ + end = LIST_FIND_TAIL(command, *l); + LIST_INSERT_AFTER(command, *l, end, e); + } else + *l = e; +} + +int exec_command_set(ExecCommand *c, const char *path, ...) { + va_list ap; + char **l, *p; + + assert(c); + assert(path); + + va_start(ap, path); + l = strv_new_ap(path, ap); + va_end(ap); + + if (!l) + return -ENOMEM; + + p = strdup(path); + if (!p) { + strv_free(l); + return -ENOMEM; + } + + free_and_replace(c->path, p); + + return strv_free_and_replace(c->argv, l); +} + +int exec_command_append(ExecCommand *c, const char *path, ...) { + _cleanup_strv_free_ char **l = NULL; + va_list ap; + int r; + + assert(c); + assert(path); + + va_start(ap, path); + l = strv_new_ap(path, ap); + va_end(ap); + + if (!l) + return -ENOMEM; + + r = strv_extend_strv(&c->argv, l, false); + if (r < 0) + return r; + + return 0; +} + +static char *destroy_tree(char *path) { + if (!path) + return NULL; + + if (!path_equal(path, RUN_SYSTEMD_EMPTY)) { + log_debug("Spawning process to nuke '%s'", path); + + (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL); + } + + return mfree(path); +} + +void exec_shared_runtime_done(ExecSharedRuntime *rt) { + if (!rt) + return; + + if (rt->manager) + (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id); + + rt->id = mfree(rt->id); + rt->tmp_dir = mfree(rt->tmp_dir); + rt->var_tmp_dir = mfree(rt->var_tmp_dir); + safe_close_pair(rt->netns_storage_socket); + safe_close_pair(rt->ipcns_storage_socket); +} + +static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) { + exec_shared_runtime_done(rt); + + return mfree(rt); +} + +DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free); + +ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) { + if (!rt) + return NULL; + + assert(rt->n_ref > 0); + rt->n_ref--; + + if (rt->n_ref > 0) + return NULL; + + rt->tmp_dir = destroy_tree(rt->tmp_dir); + rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir); + + return exec_shared_runtime_free(rt); +} + +static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) { + _cleanup_free_ char *id_copy = NULL; + ExecSharedRuntime *n; + + assert(ret); + + id_copy = strdup(id); + if (!id_copy) + return -ENOMEM; + + n = new(ExecSharedRuntime, 1); + if (!n) + return -ENOMEM; + + *n = (ExecSharedRuntime) { + .id = TAKE_PTR(id_copy), + .netns_storage_socket = EBADF_PAIR, + .ipcns_storage_socket = EBADF_PAIR, + }; + + *ret = n; + return 0; +} + +static int exec_shared_runtime_add( + Manager *m, + const char *id, + char **tmp_dir, + char **var_tmp_dir, + int netns_storage_socket[2], + int ipcns_storage_socket[2], + ExecSharedRuntime **ret) { + + _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL; + int r; + + assert(m); + assert(id); + + /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */ + + r = exec_shared_runtime_allocate(&rt, id); + if (r < 0) + return r; + + r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt); + if (r < 0) + return r; + + assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */ + rt->tmp_dir = TAKE_PTR(*tmp_dir); + rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir); + + if (netns_storage_socket) { + rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]); + rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]); + } + + if (ipcns_storage_socket) { + rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]); + rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]); + } + + rt->manager = m; + + if (ret) + *ret = rt; + /* do not remove created ExecSharedRuntime object when the operation succeeds. */ + TAKE_PTR(rt); + return 0; +} + +static int exec_shared_runtime_make( + Manager *m, + const ExecContext *c, + const char *id, + ExecSharedRuntime **ret) { + + _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL; + _cleanup_close_pair_ int netns_storage_socket[2] = EBADF_PAIR, ipcns_storage_socket[2] = EBADF_PAIR; + int r; + + assert(m); + assert(c); + assert(id); + + /* It is not necessary to create ExecSharedRuntime object. */ + if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) { + *ret = NULL; + return 0; + } + + if (c->private_tmp && + !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") && + (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") || + prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) { + r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir); + if (r < 0) + return r; + } + + if (exec_needs_network_namespace(c)) { + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0) + return -errno; + } + + if (exec_needs_ipc_namespace(c)) { + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0) + return -errno; + } + + r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret); + if (r < 0) + return r; + + return 1; +} + +int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) { + ExecSharedRuntime *rt; + int r; + + assert(m); + assert(id); + assert(ret); + + rt = hashmap_get(m->exec_shared_runtime_by_id, id); + if (rt) + /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */ + goto ref; + + if (!create) { + *ret = NULL; + return 0; + } + + /* If not found, then create a new object. */ + r = exec_shared_runtime_make(m, c, id, &rt); + if (r < 0) + return r; + if (r == 0) { + /* When r == 0, it is not necessary to create ExecSharedRuntime object. */ + *ret = NULL; + return 0; + } + +ref: + /* increment reference counter. */ + rt->n_ref++; + *ret = rt; + return 1; +} + +int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) { + ExecSharedRuntime *rt; + + assert(m); + assert(f); + assert(fds); + + HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) { + fprintf(f, "exec-runtime=%s", rt->id); + + if (rt->tmp_dir) + fprintf(f, " tmp-dir=%s", rt->tmp_dir); + + if (rt->var_tmp_dir) + fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir); + + if (rt->netns_storage_socket[0] >= 0) { + int copy; + + copy = fdset_put_dup(fds, rt->netns_storage_socket[0]); + if (copy < 0) + return copy; + + fprintf(f, " netns-socket-0=%i", copy); + } + + if (rt->netns_storage_socket[1] >= 0) { + int copy; + + copy = fdset_put_dup(fds, rt->netns_storage_socket[1]); + if (copy < 0) + return copy; + + fprintf(f, " netns-socket-1=%i", copy); + } + + if (rt->ipcns_storage_socket[0] >= 0) { + int copy; + + copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]); + if (copy < 0) + return copy; + + fprintf(f, " ipcns-socket-0=%i", copy); + } + + if (rt->ipcns_storage_socket[1] >= 0) { + int copy; + + copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]); + if (copy < 0) + return copy; + + fprintf(f, " ipcns-socket-1=%i", copy); + } + + fputc('\n', f); + } + + return 0; +} + +int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) { + _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL; + ExecSharedRuntime *rt = NULL; + int r; + + /* This is for the migration from old (v237 or earlier) deserialization text. + * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=. + * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge + * so or not from the serialized text, then we always creates a new object owned by this. */ + + assert(u); + assert(key); + assert(value); + + /* Manager manages ExecSharedRuntime objects by the unit id. + * So, we omit the serialized text when the unit does not have id (yet?)... */ + if (isempty(u->id)) { + log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter."); + return 0; + } + + if (u->manager) { + if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0) + return log_oom(); + + rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id); + } + if (!rt) { + if (exec_shared_runtime_allocate(&rt_create, u->id) < 0) + return log_oom(); + + rt = rt_create; + } + + if (streq(key, "tmp-dir")) { + if (free_and_strdup_warn(&rt->tmp_dir, value) < 0) + return -ENOMEM; + + } else if (streq(key, "var-tmp-dir")) { + if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0) + return -ENOMEM; + + } else if (streq(key, "netns-socket-0")) { + + safe_close(rt->netns_storage_socket[0]); + rt->netns_storage_socket[0] = deserialize_fd(fds, value); + if (rt->netns_storage_socket[0] < 0) + return 0; + + } else if (streq(key, "netns-socket-1")) { + + safe_close(rt->netns_storage_socket[1]); + rt->netns_storage_socket[1] = deserialize_fd(fds, value); + if (rt->netns_storage_socket[1] < 0) + return 0; + } else + return 0; + + /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */ + if (rt_create && u->manager) { + r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m"); + return 0; + } + + rt_create->manager = u->manager; + + /* Avoid cleanup */ + TAKE_PTR(rt_create); + } + + return 1; +} + +int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) { + _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL; + char *id = NULL; + int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1}; + const char *p, *v = ASSERT_PTR(value); + size_t n; + + assert(m); + assert(fds); + + n = strcspn(v, " "); + id = strndupa_safe(v, n); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + + v = startswith(p, "tmp-dir="); + if (v) { + n = strcspn(v, " "); + tmp_dir = strndup(v, n); + if (!tmp_dir) + return log_oom(); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "var-tmp-dir="); + if (v) { + n = strcspn(v, " "); + var_tmp_dir = strndup(v, n); + if (!var_tmp_dir) + return log_oom(); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "netns-socket-0="); + if (v) { + char *buf; + + n = strcspn(v, " "); + buf = strndupa_safe(v, n); + + netns_fdpair[0] = deserialize_fd(fds, buf); + if (netns_fdpair[0] < 0) + return netns_fdpair[0]; + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "netns-socket-1="); + if (v) { + char *buf; + + n = strcspn(v, " "); + buf = strndupa_safe(v, n); + + netns_fdpair[1] = deserialize_fd(fds, buf); + if (netns_fdpair[1] < 0) + return netns_fdpair[1]; + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "ipcns-socket-0="); + if (v) { + char *buf; + + n = strcspn(v, " "); + buf = strndupa_safe(v, n); + + ipcns_fdpair[0] = deserialize_fd(fds, buf); + if (ipcns_fdpair[0] < 0) + return ipcns_fdpair[0]; + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "ipcns-socket-1="); + if (v) { + char *buf; + + n = strcspn(v, " "); + buf = strndupa_safe(v, n); + + ipcns_fdpair[1] = deserialize_fd(fds, buf); + if (ipcns_fdpair[1] < 0) + return ipcns_fdpair[1]; + } + +finalize: + r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to add exec-runtime: %m"); + return 0; +} + +void exec_shared_runtime_vacuum(Manager *m) { + ExecSharedRuntime *rt; + + assert(m); + + /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */ + + HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) { + if (rt->n_ref > 0) + continue; + + (void) exec_shared_runtime_free(rt); + } +} + +int exec_runtime_make( + const Unit *unit, + const ExecContext *context, + ExecSharedRuntime *shared, + DynamicCreds *creds, + ExecRuntime **ret) { + _cleanup_close_pair_ int ephemeral_storage_socket[2] = EBADF_PAIR; + _cleanup_free_ char *ephemeral = NULL; + _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL; + int r; + + assert(unit); + assert(context); + assert(ret); + + if (!shared && !creds && !exec_needs_ephemeral(context)) { + *ret = NULL; + return 0; + } + + if (exec_needs_ephemeral(context)) { + r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755); + if (r < 0) + return r; + + r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0) + return -errno; + } + + rt = new(ExecRuntime, 1); + if (!rt) + return -ENOMEM; + + *rt = (ExecRuntime) { + .shared = shared, + .dynamic_creds = creds, + .ephemeral_copy = TAKE_PTR(ephemeral), + .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]), + .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]), + }; + + *ret = TAKE_PTR(rt); + return 1; +} + +ExecRuntime* exec_runtime_free(ExecRuntime *rt) { + if (!rt) + return NULL; + + exec_shared_runtime_unref(rt->shared); + dynamic_creds_unref(rt->dynamic_creds); + + rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy); + + safe_close_pair(rt->ephemeral_storage_socket); + return mfree(rt); +} + +ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) { + if (!rt) + return NULL; + + rt->shared = exec_shared_runtime_destroy(rt->shared); + rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds); + return exec_runtime_free(rt); +} + +void exec_runtime_clear(ExecRuntime *rt) { + if (!rt) + return; + + safe_close_pair(rt->ephemeral_storage_socket); + rt->ephemeral_copy = mfree(rt->ephemeral_copy); +} + +void exec_params_shallow_clear(ExecParameters *p) { + if (!p) + return; + + /* This is called on the PID1 side, as many of the struct's FDs are only borrowed, and actually + * owned by the manager or other objects, and reused across multiple units. */ + + p->environment = strv_free(p->environment); + p->fd_names = strv_free(p->fd_names); + p->files_env = strv_free(p->files_env); + p->fds = mfree(p->fds); + p->exec_fd = safe_close(p->exec_fd); + p->user_lookup_fd = -EBADF; + p->bpf_outer_map_fd = -EBADF; + p->unit_id = mfree(p->unit_id); + p->invocation_id = SD_ID128_NULL; + p->invocation_id_string[0] = '\0'; + p->confirm_spawn = mfree(p->confirm_spawn); +} + +void exec_params_deep_clear(ExecParameters *p) { + if (!p) + return; + + /* This is called on the sd-executor side, where everything received is owned by the process and has + * to be fully cleaned up to make sanitizers and analyzers happy, as opposed as the shallow clean + * function above. */ + + close_many_unset(p->fds, p->n_socket_fds + p->n_storage_fds); + + p->cgroup_path = mfree(p->cgroup_path); + + if (p->prefix) { + free_many_charp(p->prefix, _EXEC_DIRECTORY_TYPE_MAX); + p->prefix = mfree(p->prefix); + } + + p->received_credentials_directory = mfree(p->received_credentials_directory); + p->received_encrypted_credentials_directory = mfree(p->received_encrypted_credentials_directory); + + if (p->idle_pipe) { + close_many_and_free(p->idle_pipe, 4); + p->idle_pipe = NULL; + } + + p->stdin_fd = safe_close(p->stdin_fd); + p->stdout_fd = safe_close(p->stdout_fd); + p->stderr_fd = safe_close(p->stderr_fd); + + p->notify_socket = mfree(p->notify_socket); + + open_file_free_many(&p->open_files); + + p->fallback_smack_process_label = mfree(p->fallback_smack_process_label); + + exec_params_shallow_clear(p); +} + +void exec_directory_done(ExecDirectory *d) { + if (!d) + return; + + FOREACH_ARRAY(i, d->items, d->n_items) { + free(i->path); + strv_free(i->symlinks); + } + + d->items = mfree(d->items); + d->n_items = 0; + d->mode = 0755; +} + +static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) { + assert(d); + assert(path); + + FOREACH_ARRAY(i, d->items, d->n_items) + if (path_equal(i->path, path)) + return i; + + return NULL; +} + +int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) { + _cleanup_strv_free_ char **s = NULL; + _cleanup_free_ char *p = NULL; + ExecDirectoryItem *existing; + int r; + + assert(d); + assert(path); + + existing = exec_directory_find(d, path); + if (existing) { + r = strv_extend(&existing->symlinks, symlink); + if (r < 0) + return r; + + return 0; /* existing item is updated */ + } + + p = strdup(path); + if (!p) + return -ENOMEM; + + if (symlink) { + s = strv_new(symlink); + if (!s) + return -ENOMEM; + } + + if (!GREEDY_REALLOC(d->items, d->n_items + 1)) + return -ENOMEM; + + d->items[d->n_items++] = (ExecDirectoryItem) { + .path = TAKE_PTR(p), + .symlinks = TAKE_PTR(s), + }; + + return 1; /* new item is added */ +} + +static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) { + assert(a); + assert(b); + + return path_compare(a->path, b->path); +} + +void exec_directory_sort(ExecDirectory *d) { + assert(d); + + /* Sort the exec directories to make always parent directories processed at first in + * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first, + * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the + * list. See also comments in setup_exec_directory() and issue #24783. */ + + if (d->n_items <= 1) + return; + + typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func); + + for (size_t i = 1; i < d->n_items; i++) + for (size_t j = 0; j < i; j++) + if (path_startswith(d->items[i].path, d->items[j].path)) { + d->items[i].only_create = true; + break; + } +} + +ExecCleanMask exec_clean_mask_from_string(const char *s) { + ExecDirectoryType t; + + assert(s); + + if (streq(s, "all")) + return EXEC_CLEAN_ALL; + if (streq(s, "fdstore")) + return EXEC_CLEAN_FDSTORE; + + t = exec_resource_type_from_string(s); + if (t < 0) + return (ExecCleanMask) t; + + return 1U << t; +} + +static const char* const exec_input_table[_EXEC_INPUT_MAX] = { + [EXEC_INPUT_NULL] = "null", + [EXEC_INPUT_TTY] = "tty", + [EXEC_INPUT_TTY_FORCE] = "tty-force", + [EXEC_INPUT_TTY_FAIL] = "tty-fail", + [EXEC_INPUT_SOCKET] = "socket", + [EXEC_INPUT_NAMED_FD] = "fd", + [EXEC_INPUT_DATA] = "data", + [EXEC_INPUT_FILE] = "file", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput); + +static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = { + [EXEC_OUTPUT_INHERIT] = "inherit", + [EXEC_OUTPUT_NULL] = "null", + [EXEC_OUTPUT_TTY] = "tty", + [EXEC_OUTPUT_KMSG] = "kmsg", + [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console", + [EXEC_OUTPUT_JOURNAL] = "journal", + [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console", + [EXEC_OUTPUT_SOCKET] = "socket", + [EXEC_OUTPUT_NAMED_FD] = "fd", + [EXEC_OUTPUT_FILE] = "file", + [EXEC_OUTPUT_FILE_APPEND] = "append", + [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput); + +static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = { + [EXEC_UTMP_INIT] = "init", + [EXEC_UTMP_LOGIN] = "login", + [EXEC_UTMP_USER] = "user", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode); + +static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = { + [EXEC_PRESERVE_NO] = "no", + [EXEC_PRESERVE_YES] = "yes", + [EXEC_PRESERVE_RESTART] = "restart", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES); + +/* This table maps ExecDirectoryType to the setting it is configured with in the unit */ +static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory", + [EXEC_DIRECTORY_STATE] = "StateDirectory", + [EXEC_DIRECTORY_CACHE] = "CacheDirectory", + [EXEC_DIRECTORY_LOGS] = "LogsDirectory", + [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType); + +/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */ +static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink", + [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink", + [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink", + [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink", + [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType); + +static const char* const exec_directory_type_mode_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectoryMode", + [EXEC_DIRECTORY_STATE] = "StateDirectoryMode", + [EXEC_DIRECTORY_CACHE] = "CacheDirectoryMode", + [EXEC_DIRECTORY_LOGS] = "LogsDirectoryMode", + [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectoryMode", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_mode, ExecDirectoryType); + +/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This + * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit + * directories, specifically .timer units with their timestamp touch file. */ +static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "runtime", + [EXEC_DIRECTORY_STATE] = "state", + [EXEC_DIRECTORY_CACHE] = "cache", + [EXEC_DIRECTORY_LOGS] = "logs", + [EXEC_DIRECTORY_CONFIGURATION] = "configuration", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType); + +static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = { + [EXEC_KEYRING_INHERIT] = "inherit", + [EXEC_KEYRING_PRIVATE] = "private", + [EXEC_KEYRING_SHARED] = "shared", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode); diff --git a/src/core/execute.h b/src/core/execute.h new file mode 100644 index 0000000..5a6927a --- /dev/null +++ b/src/core/execute.h @@ -0,0 +1,701 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct ExecStatus ExecStatus; +typedef struct ExecCommand ExecCommand; +typedef struct ExecContext ExecContext; +typedef struct ExecSharedRuntime ExecSharedRuntime; +typedef struct DynamicCreds DynamicCreds; +typedef struct ExecRuntime ExecRuntime; +typedef struct ExecParameters ExecParameters; +typedef struct Manager Manager; + +#include +#include +#include +#include + +#include "cgroup-util.h" +#include "coredump-util.h" +#include "cpu-set-util.h" +#include "exec-util.h" +#include "fdset.h" +#include "list.h" +#include "missing_resource.h" +#include "namespace.h" +#include "nsflags.h" +#include "numa-util.h" +#include "open-file.h" +#include "path-util.h" +#include "runtime-scope.h" +#include "set.h" +#include "time-util.h" + +#define EXEC_STDIN_DATA_MAX (64U*1024U*1024U) + +typedef enum ExecUtmpMode { + EXEC_UTMP_INIT, + EXEC_UTMP_LOGIN, + EXEC_UTMP_USER, + _EXEC_UTMP_MODE_MAX, + _EXEC_UTMP_MODE_INVALID = -EINVAL, +} ExecUtmpMode; + +typedef enum ExecInput { + EXEC_INPUT_NULL, + EXEC_INPUT_TTY, + EXEC_INPUT_TTY_FORCE, + EXEC_INPUT_TTY_FAIL, + EXEC_INPUT_SOCKET, + EXEC_INPUT_NAMED_FD, + EXEC_INPUT_DATA, + EXEC_INPUT_FILE, + _EXEC_INPUT_MAX, + _EXEC_INPUT_INVALID = -EINVAL, +} ExecInput; + +typedef enum ExecOutput { + EXEC_OUTPUT_INHERIT, + EXEC_OUTPUT_NULL, + EXEC_OUTPUT_TTY, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_SOCKET, + EXEC_OUTPUT_NAMED_FD, + EXEC_OUTPUT_FILE, + EXEC_OUTPUT_FILE_APPEND, + EXEC_OUTPUT_FILE_TRUNCATE, + _EXEC_OUTPUT_MAX, + _EXEC_OUTPUT_INVALID = -EINVAL, +} ExecOutput; + +typedef enum ExecPreserveMode { + EXEC_PRESERVE_NO, + EXEC_PRESERVE_YES, + EXEC_PRESERVE_RESTART, + _EXEC_PRESERVE_MODE_MAX, + _EXEC_PRESERVE_MODE_INVALID = -EINVAL, +} ExecPreserveMode; + +typedef enum ExecKeyringMode { + EXEC_KEYRING_INHERIT, + EXEC_KEYRING_PRIVATE, + EXEC_KEYRING_SHARED, + _EXEC_KEYRING_MODE_MAX, + _EXEC_KEYRING_MODE_INVALID = -EINVAL, +} ExecKeyringMode; + +/* Contains start and exit information about an executed command. */ +struct ExecStatus { + dual_timestamp start_timestamp; + dual_timestamp exit_timestamp; + pid_t pid; + int code; /* as in siginfo_t::si_code */ + int status; /* as in siginfo_t::si_status */ +}; + +/* Stores information about commands we execute. Covers both configuration settings as well as runtime data. */ +struct ExecCommand { + char *path; + char **argv; + ExecStatus exec_status; /* Note that this is not serialized to sd-executor */ + ExecCommandFlags flags; + LIST_FIELDS(ExecCommand, command); /* useful for chaining commands */ +}; + +/* Encapsulates certain aspects of the runtime environment that is to be shared between multiple otherwise separate + * invocations of commands. Specifically, this allows sharing of /tmp and /var/tmp data as well as network namespaces + * between invocations of commands. This is a reference counted object, with one reference taken by each currently + * active command invocation that wants to share this runtime. */ +struct ExecSharedRuntime { + unsigned n_ref; + + Manager *manager; + + char *id; /* Unit id of the owner */ + + char *tmp_dir; + char *var_tmp_dir; + + /* An AF_UNIX socket pair, that contains a datagram containing a file descriptor referring to the network + * namespace. */ + int netns_storage_socket[2]; + + /* Like netns_storage_socket, but the file descriptor is referring to the IPC namespace. */ + int ipcns_storage_socket[2]; +}; + +struct ExecRuntime { + ExecSharedRuntime *shared; + DynamicCreds *dynamic_creds; + + /* The path to the ephemeral snapshot of the root directory or root image if one was requested. */ + char *ephemeral_copy; + + /* An AF_UNIX socket pair that receives the locked file descriptor referring to the ephemeral copy of + * the root directory or root image. The lock prevents tmpfiles from removing the ephemeral snapshot + * until we're done using it. */ + int ephemeral_storage_socket[2]; +}; + +typedef enum ExecDirectoryType { + EXEC_DIRECTORY_RUNTIME, + EXEC_DIRECTORY_STATE, + EXEC_DIRECTORY_CACHE, + EXEC_DIRECTORY_LOGS, + EXEC_DIRECTORY_CONFIGURATION, + _EXEC_DIRECTORY_TYPE_MAX, + _EXEC_DIRECTORY_TYPE_INVALID = -EINVAL, +} ExecDirectoryType; + +typedef struct ExecDirectoryItem { + char *path; + char **symlinks; + bool only_create; +} ExecDirectoryItem; + +typedef struct ExecDirectory { + mode_t mode; + size_t n_items; + ExecDirectoryItem *items; +} ExecDirectory; + +typedef enum ExecCleanMask { + /* In case you wonder why the bitmask below doesn't use "directory" in its name: we want to keep this + * generic so that .timer timestamp files can nicely be covered by this too, and similar. */ + EXEC_CLEAN_RUNTIME = 1U << EXEC_DIRECTORY_RUNTIME, + EXEC_CLEAN_STATE = 1U << EXEC_DIRECTORY_STATE, + EXEC_CLEAN_CACHE = 1U << EXEC_DIRECTORY_CACHE, + EXEC_CLEAN_LOGS = 1U << EXEC_DIRECTORY_LOGS, + EXEC_CLEAN_CONFIGURATION = 1U << EXEC_DIRECTORY_CONFIGURATION, + EXEC_CLEAN_FDSTORE = 1U << _EXEC_DIRECTORY_TYPE_MAX, + EXEC_CLEAN_NONE = 0, + EXEC_CLEAN_ALL = (1U << (_EXEC_DIRECTORY_TYPE_MAX+1)) - 1, + _EXEC_CLEAN_MASK_INVALID = -EINVAL, +} ExecCleanMask; + +/* Encodes configuration parameters applied to invoked commands. Does not carry runtime data, but only configuration + * changes sourced from unit files and suchlike. ExecContext objects are usually embedded into Unit objects, and do not + * change after being loaded. */ +struct ExecContext { + char **environment; + char **environment_files; + char **pass_environment; + char **unset_environment; + + struct rlimit *rlimit[_RLIMIT_MAX]; + char *working_directory, *root_directory, *root_image, *root_verity, *root_hash_path, *root_hash_sig_path; + void *root_hash, *root_hash_sig; + size_t root_hash_size, root_hash_sig_size; + LIST_HEAD(MountOptions, root_image_options); + bool root_ephemeral; + bool working_directory_missing_ok:1; + bool working_directory_home:1; + + bool oom_score_adjust_set:1; + bool coredump_filter_set:1; + bool nice_set:1; + bool ioprio_set:1; + bool cpu_sched_set:1; + bool mount_apivfs_set:1; + + /* This is not exposed to the user but available internally. We need it to make sure that whenever we + * spawn /usr/bin/mount it is run in the same process group as us so that the autofs logic detects + * that it belongs to us and we don't enter a trigger loop. */ + bool same_pgrp; + + bool cpu_sched_reset_on_fork; + bool non_blocking; + + mode_t umask; + int oom_score_adjust; + int nice; + int ioprio; + int cpu_sched_policy; + int cpu_sched_priority; + uint64_t coredump_filter; + + CPUSet cpu_set; + NUMAPolicy numa_policy; + bool cpu_affinity_from_numa; + + ExecInput std_input; + ExecOutput std_output; + ExecOutput std_error; + + /* At least one of stdin/stdout/stderr was initialized from an fd passed in. This boolean survives + * the fds being closed. This only makes sense for transient units. */ + bool stdio_as_fds; + + char *stdio_fdname[3]; + char *stdio_file[3]; + + void *stdin_data; + size_t stdin_data_size; + + nsec_t timer_slack_nsec; + + char *tty_path; + + bool tty_reset; + bool tty_vhangup; + bool tty_vt_disallocate; + + unsigned tty_rows; + unsigned tty_cols; + + bool ignore_sigpipe; + + ExecKeyringMode keyring_mode; + + /* Since resolving these names might involve socket + * connections and we don't want to deadlock ourselves these + * names are resolved on execution only and in the child + * process. */ + char *user; + char *group; + char **supplementary_groups; + + int set_login_environment; + + char *pam_name; + + char *utmp_id; + ExecUtmpMode utmp_mode; + + bool no_new_privileges; + + bool selinux_context_ignore; + bool apparmor_profile_ignore; + bool smack_process_label_ignore; + + char *selinux_context; + char *apparmor_profile; + char *smack_process_label; + + char **read_write_paths, **read_only_paths, **inaccessible_paths, **exec_paths, **no_exec_paths; + char **exec_search_path; + unsigned long mount_propagation_flag; + BindMount *bind_mounts; + size_t n_bind_mounts; + TemporaryFileSystem *temporary_filesystems; + size_t n_temporary_filesystems; + MountImage *mount_images; + size_t n_mount_images; + MountImage *extension_images; + size_t n_extension_images; + char **extension_directories; + + uint64_t capability_bounding_set; + uint64_t capability_ambient_set; + int secure_bits; + + int syslog_priority; + bool syslog_level_prefix; + char *syslog_identifier; + + struct iovec* log_extra_fields; + size_t n_log_extra_fields; + Set *log_filter_allowed_patterns; + Set *log_filter_denied_patterns; + + usec_t log_ratelimit_interval_usec; + unsigned log_ratelimit_burst; + + int log_level_max; + + char *log_namespace; + + ProtectProc protect_proc; /* hidepid= */ + ProcSubset proc_subset; /* subset= */ + + int private_mounts; + int memory_ksm; + bool private_tmp; + bool private_network; + bool private_devices; + bool private_users; + bool private_ipc; + bool protect_kernel_tunables; + bool protect_kernel_modules; + bool protect_kernel_logs; + bool protect_clock; + bool protect_control_groups; + ProtectSystem protect_system; + ProtectHome protect_home; + bool protect_hostname; + bool mount_apivfs; + + bool dynamic_user; + bool remove_ipc; + + bool memory_deny_write_execute; + bool restrict_realtime; + bool restrict_suid_sgid; + + bool lock_personality; + unsigned long personality; + + unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */ + + Set *restrict_filesystems; + bool restrict_filesystems_allow_list:1; + + Hashmap *syscall_filter; + Set *syscall_archs; + int syscall_errno; + bool syscall_allow_list:1; + + Hashmap *syscall_log; + bool syscall_log_allow_list:1; /* Log listed system calls */ + + bool address_families_allow_list:1; + Set *address_families; + + char *network_namespace_path; + char *ipc_namespace_path; + + ExecDirectory directories[_EXEC_DIRECTORY_TYPE_MAX]; + ExecPreserveMode runtime_directory_preserve_mode; + usec_t timeout_clean_usec; + + Hashmap *set_credentials; /* output id → ExecSetCredential */ + Hashmap *load_credentials; /* output id → ExecLoadCredential */ + Set *import_credentials; + + ImagePolicy *root_image_policy, *mount_image_policy, *extension_image_policy; +}; + +static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) { + assert(c); + + return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL; +} + +static inline bool exec_context_restrict_filesystems_set(const ExecContext *c) { + assert(c); + + return c->restrict_filesystems_allow_list || + !set_isempty(c->restrict_filesystems); +} + +static inline bool exec_context_with_rootfs(const ExecContext *c) { + assert(c); + + /* Checks if RootDirectory= or RootImage= are used */ + + return !empty_or_root(c->root_directory) || c->root_image; +} + +typedef enum ExecFlags { + EXEC_APPLY_SANDBOXING = 1 << 0, + EXEC_APPLY_CHROOT = 1 << 1, + EXEC_APPLY_TTY_STDIN = 1 << 2, + EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */ + EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */ + EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */ + EXEC_CGROUP_DELEGATE = 1 << 6, + EXEC_IS_CONTROL = 1 << 7, + EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */ + EXEC_WRITE_CREDENTIALS = 1 << 9, /* Set up the credential store logic */ + + /* The following are not used by execute.c, but by consumers internally */ + EXEC_PASS_FDS = 1 << 10, + EXEC_SETENV_RESULT = 1 << 11, + EXEC_SET_WATCHDOG = 1 << 12, + EXEC_SETENV_MONITOR_RESULT = 1 << 13, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */ +} ExecFlags; + +/* Parameters for a specific invocation of a command. This structure is put together right before a command is + * executed. */ +struct ExecParameters { + RuntimeScope runtime_scope; + + char **environment; + + int *fds; + char **fd_names; + size_t n_socket_fds; + size_t n_storage_fds; + + ExecFlags flags; + bool selinux_context_net:1; + + CGroupMask cgroup_supported; + char *cgroup_path; + uint64_t cgroup_id; + + char **prefix; + char *received_credentials_directory; + char *received_encrypted_credentials_directory; + + char *confirm_spawn; + bool shall_confirm_spawn; + + usec_t watchdog_usec; + + int *idle_pipe; + + int stdin_fd; + int stdout_fd; + int stderr_fd; + + /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */ + int exec_fd; + + char *notify_socket; + + LIST_HEAD(OpenFile, open_files); + + char *fallback_smack_process_label; + + char **files_env; + int user_lookup_fd; + int bpf_outer_map_fd; + + /* Used for logging in the executor functions */ + char *unit_id; + sd_id128_t invocation_id; + char invocation_id_string[SD_ID128_STRING_MAX]; +}; + +#define EXEC_PARAMETERS_INIT(_flags) \ + (ExecParameters) { \ + .flags = (_flags), \ + .stdin_fd = -EBADF, \ + .stdout_fd = -EBADF, \ + .stderr_fd = -EBADF, \ + .exec_fd = -EBADF, \ + .bpf_outer_map_fd = -EBADF, \ + .user_lookup_fd = -EBADF, \ + }; + +#include "unit.h" +#include "dynamic-user.h" + +int exec_spawn(Unit *unit, + ExecCommand *command, + const ExecContext *context, + ExecParameters *exec_params, + ExecRuntime *runtime, + const CGroupContext *cgroup_context, + pid_t *ret); + +void exec_command_done(ExecCommand *c); +void exec_command_done_array(ExecCommand *c, size_t n); +ExecCommand* exec_command_free_list(ExecCommand *c); +void exec_command_free_array(ExecCommand **c, size_t n); +void exec_command_reset_status_array(ExecCommand *c, size_t n); +void exec_command_reset_status_list_array(ExecCommand **c, size_t n); +void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix); +void exec_command_append_list(ExecCommand **l, ExecCommand *e); +int exec_command_set(ExecCommand *c, const char *path, ...) _sentinel_; +int exec_command_append(ExecCommand *c, const char *path, ...) _sentinel_; + +void exec_context_init(ExecContext *c); +void exec_context_done(ExecContext *c); +void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix); + +int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_root); +int exec_context_destroy_mount_ns_dir(Unit *u); + +const char* exec_context_fdname(const ExecContext *c, int fd_index); + +bool exec_context_may_touch_console(const ExecContext *c); +bool exec_context_maintains_privileges(const ExecContext *c); + +int exec_context_get_effective_ioprio(const ExecContext *c); +bool exec_context_get_effective_mount_apivfs(const ExecContext *c); + +void exec_context_free_log_extra_fields(ExecContext *c); + +void exec_context_revert_tty(ExecContext *c); + +int exec_context_get_clean_directories(ExecContext *c, char **prefix, ExecCleanMask mask, char ***ret); +int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret); + +const char *exec_context_tty_path(const ExecContext *context); +int exec_context_apply_tty_size(const ExecContext *context, int tty_fd, const char *tty_path); +void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p); + +uint64_t exec_context_get_rlimit(const ExecContext *c, const char *name); +int exec_context_get_oom_score_adjust(const ExecContext *c); +uint64_t exec_context_get_coredump_filter(const ExecContext *c); +int exec_context_get_nice(const ExecContext *c); +int exec_context_get_cpu_sched_policy(const ExecContext *c); +int exec_context_get_cpu_sched_priority(const ExecContext *c); +uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c); +char** exec_context_get_syscall_filter(const ExecContext *c); +char** exec_context_get_syscall_archs(const ExecContext *c); +char** exec_context_get_syscall_log(const ExecContext *c); +char** exec_context_get_address_families(const ExecContext *c); +char** exec_context_get_restrict_filesystems(const ExecContext *c); + +void exec_status_start(ExecStatus *s, pid_t pid); +void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status); +void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix); +void exec_status_reset(ExecStatus *s); + +int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *name, bool create, ExecSharedRuntime **ret); +ExecSharedRuntime *exec_shared_runtime_destroy(ExecSharedRuntime *r); +ExecSharedRuntime *exec_shared_runtime_unref(ExecSharedRuntime *r); +DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_unref); + +int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds); +int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds); +int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds); +void exec_shared_runtime_done(ExecSharedRuntime *rt); +void exec_shared_runtime_vacuum(Manager *m); + +int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret); +ExecRuntime* exec_runtime_free(ExecRuntime *rt); +DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free); +ExecRuntime* exec_runtime_destroy(ExecRuntime *rt); +void exec_runtime_clear(ExecRuntime *rt); + +int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret); +void exec_params_shallow_clear(ExecParameters *p); +void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix); +void exec_params_deep_clear(ExecParameters *p); + +bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c); + +void exec_directory_done(ExecDirectory *d); +int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink); +void exec_directory_sort(ExecDirectory *d); +bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type); + +ExecCleanMask exec_clean_mask_from_string(const char *s); + +const char* exec_output_to_string(ExecOutput i) _const_; +ExecOutput exec_output_from_string(const char *s) _pure_; + +const char* exec_input_to_string(ExecInput i) _const_; +ExecInput exec_input_from_string(const char *s) _pure_; + +const char* exec_utmp_mode_to_string(ExecUtmpMode i) _const_; +ExecUtmpMode exec_utmp_mode_from_string(const char *s) _pure_; + +const char* exec_preserve_mode_to_string(ExecPreserveMode i) _const_; +ExecPreserveMode exec_preserve_mode_from_string(const char *s) _pure_; + +const char* exec_keyring_mode_to_string(ExecKeyringMode i) _const_; +ExecKeyringMode exec_keyring_mode_from_string(const char *s) _pure_; + +const char* exec_directory_type_to_string(ExecDirectoryType i) _const_; +ExecDirectoryType exec_directory_type_from_string(const char *s) _pure_; + +const char* exec_directory_type_symlink_to_string(ExecDirectoryType i) _const_; +ExecDirectoryType exec_directory_type_symlink_from_string(const char *s) _pure_; + +const char* exec_directory_type_mode_to_string(ExecDirectoryType i) _const_; +ExecDirectoryType exec_directory_type_mode_from_string(const char *s) _pure_; + +const char* exec_resource_type_to_string(ExecDirectoryType i) _const_; +ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_; + +bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime); +bool exec_needs_network_namespace(const ExecContext *context); +bool exec_needs_ipc_namespace(const ExecContext *context); + +/* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters + * instead of the unit object, so that it can be used in the sd-executor context (where the unit object is + * not available). */ + +#define LOG_EXEC_ID_FIELD(ep) \ + ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_UNIT=" : "UNIT=") +#define LOG_EXEC_ID_FIELD_FORMAT(ep) \ + ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_UNIT=%s" : "UNIT=%s") +#define LOG_EXEC_INVOCATION_ID_FIELD(ep) \ + ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_INVOCATION_ID=" : "INVOCATION_ID=") +#define LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep) \ + ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_INVOCATION_ID=%s" : "INVOCATION_ID=%s") + +#define log_exec_full_errno_zerook(ec, ep, level, error, ...) \ + ({ \ + const ExecContext *_c = (ec); \ + const ExecParameters *_p = (ep); \ + const int _l = (level); \ + bool _do_log = !(log_get_max_level() < LOG_PRI(_l) || \ + !(_c->log_level_max < 0 || \ + _c->log_level_max >= LOG_PRI(_l))); \ + LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \ + _c->n_log_extra_fields); \ + !_do_log ? -ERRNO_VALUE(error) : \ + log_object_internal(_l, error, PROJECT_FILE, \ + __LINE__, __func__, \ + LOG_EXEC_ID_FIELD(_p), \ + _p->unit_id, \ + LOG_EXEC_INVOCATION_ID_FIELD(_p), \ + _p->invocation_id_string, ##__VA_ARGS__); \ + }) + +#define log_exec_full_errno(ec, ep, level, error, ...) \ + ({ \ + int _error = (error); \ + ASSERT_NON_ZERO(_error); \ + log_exec_full_errno_zerook(ec, ep, level, _error, ##__VA_ARGS__); \ + }) + +#define log_exec_full(ec, ep, level, ...) (void) log_exec_full_errno_zerook(ec, ep, level, 0, __VA_ARGS__) + +#define log_exec_debug(ec, ep, ...) log_exec_full(ec, ep, LOG_DEBUG, __VA_ARGS__) +#define log_exec_info(ec, ep, ...) log_exec_full(ec, ep, LOG_INFO, __VA_ARGS__) +#define log_exec_notice(ec, ep, ...) log_exec_full(ec, ep, LOG_NOTICE, __VA_ARGS__) +#define log_exec_warning(ec, ep, ...) log_exec_full(ec, ep, LOG_WARNING, __VA_ARGS__) +#define log_exec_error(ec, ep, ...) log_exec_full(ec, ep, LOG_ERR, __VA_ARGS__) + +#define log_exec_debug_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_DEBUG, error, __VA_ARGS__) +#define log_exec_info_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_INFO, error, __VA_ARGS__) +#define log_exec_notice_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_NOTICE, error, __VA_ARGS__) +#define log_exec_warning_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_WARNING, error, __VA_ARGS__) +#define log_exec_error_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_ERR, error, __VA_ARGS__) + +#define log_exec_struct_errno(ec, ep, level, error, ...) \ + ({ \ + const ExecContext *_c = (ec); \ + const ExecParameters *_p = (ep); \ + const int _l = (level); \ + bool _do_log = !(_c->log_level_max < 0 || \ + _c->log_level_max >= LOG_PRI(_l)); \ + LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \ + _c->n_log_extra_fields); \ + _do_log ? \ + log_struct_errno(_l, error, __VA_ARGS__, LOG_EXEC_ID_FIELD_FORMAT(_p), _p->unit_id) : \ + -ERRNO_VALUE(error); \ + }) + +#define log_exec_struct(ec, ep, level, ...) log_exec_struct_errno(ec, ep, level, 0, __VA_ARGS__) + +#define log_exec_struct_iovec_errno(ec, ep, level, error, iovec, n_iovec) \ + ({ \ + const ExecContext *_c = (ec); \ + const ExecParameters *_p = (ep); \ + const int _l = (level); \ + bool _do_log = !(_c->log_level_max < 0 || \ + _c->log_level_max >= LOG_PRI(_l)); \ + LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \ + _c->n_log_extra_fields); \ + _do_log ? \ + log_struct_iovec_errno(_l, error, iovec, n_iovec) : \ + -ERRNO_VALUE(error); \ + }) + +#define log_exec_struct_iovec(ec, ep, level, iovec, n_iovec) log_exec_struct_iovec_errno(ec, ep, level, 0, iovec, n_iovec) + +/* Like LOG_MESSAGE(), but with the unit name prefixed. */ +#define LOG_EXEC_MESSAGE(ep, fmt, ...) LOG_MESSAGE("%s: " fmt, (ep)->unit_id, ##__VA_ARGS__) +#define LOG_EXEC_ID(ep) LOG_EXEC_ID_FIELD_FORMAT(ep), (ep)->unit_id +#define LOG_EXEC_INVOCATION_ID(ep) LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep), (ep)->invocation_id_string + +#define _LOG_CONTEXT_PUSH_EXEC(ec, ep, p, c) \ + const ExecContext *c = (ec); \ + const ExecParameters *p = (ep); \ + LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_ID_FIELD(p), p->unit_id); \ + LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_INVOCATION_ID_FIELD(p), p->invocation_id_string); \ + LOG_CONTEXT_PUSH_IOV(c->log_extra_fields, c->n_log_extra_fields) + +#define LOG_CONTEXT_PUSH_EXEC(ec, ep) \ + _LOG_CONTEXT_PUSH_EXEC(ec, ep, UNIQ_T(p, UNIQ), UNIQ_T(c, UNIQ)) diff --git a/src/core/executor.c b/src/core/executor.c new file mode 100644 index 0000000..b2716ef --- /dev/null +++ b/src/core/executor.c @@ -0,0 +1,272 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "argv-util.h" +#include "build.h" +#include "exec-invoke.h" +#include "execute-serialize.h" +#include "execute.h" +#include "exit-status.h" +#include "fdset.h" +#include "fd-util.h" +#include "fileio.h" +#include "getopt-defs.h" +#include "label-util.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "selinux-util.h" +#include "static-destruct.h" + +static FILE *arg_serialization = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_serialization, fclosep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "%sSandbox and execute processes.%s\n\n" + " -h --help Show this help and exit\n" + " --version Print version string and exit\n" + " --log-target=TARGET Set log target (console, journal,\n" + " journal-or-kmsg,\n" + " kmsg, null)\n" + " --log-level=LEVEL Set log level (debug, info, notice,\n" + " warning, err, crit,\n" + " alert, emerg)\n" + " --log-color=BOOL Highlight important messages\n" + " --log-location=BOOL Include code location in messages\n" + " --log-time=BOOL Prefix messages with current time\n" + " --deserialize=FD Deserialize process config from FD\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + COMMON_GETOPT_ARGS, + ARG_VERSION, + ARG_DESERIALIZE, + }; + + static const struct option options[] = { + { "log-level", required_argument, NULL, ARG_LOG_LEVEL }, + { "log-target", required_argument, NULL, ARG_LOG_TARGET }, + { "log-color", required_argument, NULL, ARG_LOG_COLOR }, + { "log-location", required_argument, NULL, ARG_LOG_LOCATION }, + { "log-time", required_argument, NULL, ARG_LOG_TIME }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "deserialize", required_argument, NULL, ARG_DESERIALIZE }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_LOG_LEVEL: + r = log_set_max_level_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg); + + break; + + case ARG_LOG_TARGET: + r = log_set_target_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg); + + break; + + case ARG_LOG_COLOR: + r = log_show_color_from_string(optarg); + if (r < 0) + return log_error_errno( + r, + "Failed to parse log color setting \"%s\": %m", + optarg); + + break; + + case ARG_LOG_LOCATION: + r = log_show_location_from_string(optarg); + if (r < 0) + return log_error_errno( + r, + "Failed to parse log location setting \"%s\": %m", + optarg); + + break; + + case ARG_LOG_TIME: + r = log_show_time_from_string(optarg); + if (r < 0) + return log_error_errno( + r, + "Failed to parse log time setting \"%s\": %m", + optarg); + + break; + + case ARG_DESERIALIZE: { + _cleanup_close_ int fd = -EBADF; + FILE *f; + + fd = parse_fd(optarg); + if (fd < 0) + return log_error_errno(fd, + "Failed to parse serialization fd \"%s\": %m", + optarg); + + r = fd_cloexec(fd, /* cloexec= */ true); + if (r < 0) + return log_error_errno(r, + "Failed to set serialization fd %d to close-on-exec: %m", + fd); + + f = take_fdopen(&fd, "r"); + if (!f) + return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd); + + safe_fclose(arg_serialization); + arg_serialization = f; + + break; + } + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (!arg_serialization) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No serialization fd specified."); + + return 1 /* work to do */; +} + +static int run(int argc, char *argv[]) { + _cleanup_fdset_free_ FDSet *fdset = NULL; + _cleanup_(cgroup_context_done) CGroupContext cgroup_context = {}; + _cleanup_(exec_context_done) ExecContext context = {}; + _cleanup_(exec_command_done) ExecCommand command = {}; + _cleanup_(exec_params_deep_clear) ExecParameters params = EXEC_PARAMETERS_INIT(/* flags= */ 0); + _cleanup_(exec_shared_runtime_done) ExecSharedRuntime shared = { + .netns_storage_socket = EBADF_PAIR, + .ipcns_storage_socket = EBADF_PAIR, + }; + _cleanup_(dynamic_creds_done) DynamicCreds dynamic_creds = {}; + _cleanup_(exec_runtime_clear) ExecRuntime runtime = { + .ephemeral_storage_socket = EBADF_PAIR, + .shared = &shared, + .dynamic_creds = &dynamic_creds, + }; + int exit_status = EXIT_SUCCESS, r; + + exec_context_init(&context); + cgroup_context_init(&cgroup_context); + + /* We might be starting the journal itself, we'll be told by the caller what to do */ + log_set_always_reopen_console(true); + log_set_prohibit_ipc(true); + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + /* Now that we know the intended log target, allow IPC and open the final log target. */ + log_set_prohibit_ipc(false); + log_open(); + + /* This call would collect all passed fds and enable CLOEXEC. We'll unset it in exec_invoke (flag_fds) + * for fds that shall be passed to the child. + * The serialization fd is set to CLOEXEC in parse_argv, so it's also filtered. */ + r = fdset_new_fill(/* filter_cloexec= */ 0, &fdset); + if (r < 0) + return log_error_errno(r, "Failed to create fd set: %m"); + + /* Initialize lazily. SMACK is just a few operations, but the SELinux is very slow as it requires + * loading the entire database in memory, so we will do it lazily only if it is actually needed, to + * avoid wasting 2ms-10ms for each sd-executor that gets spawned. */ + r = mac_init_lazy(); + if (r < 0) + return log_error_errno(r, "Failed to initialize MAC layer: %m"); + + r = exec_deserialize_invocation(arg_serialization, + fdset, + &context, + &command, + ¶ms, + &runtime, + &cgroup_context); + if (r < 0) + return log_error_errno(r, "Failed to deserialize: %m"); + + arg_serialization = safe_fclose(arg_serialization); + fdset = fdset_free(fdset); + + r = exec_invoke(&command, + &context, + ¶ms, + &runtime, + &cgroup_context, + &exit_status); + if (r < 0) { + const char *status = ASSERT_PTR( + exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD)); + + log_exec_struct_errno(&context, ¶ms, LOG_ERR, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_EXEC_INVOCATION_ID(¶ms), + LOG_EXEC_MESSAGE(¶ms, "Failed at step %s spawning %s: %m", + status, command.path), + "EXECUTABLE=%s", command.path); + } else + assert(exit_status == EXIT_SUCCESS); /* When 'skip' is chosen in the confirm spawn prompt */ + + return exit_status; +} + +int main(int argc, char *argv[]) { + int r; + + /* We use safe_fork() for spawning sd-pam helper process, which internally calls rename_process(). + * As the last step of renaming, all saved argvs are memzero()-ed. Hence, we need to save the argv + * first to prevent showing "intense" cmdline. See #30352. */ + save_argc_argv(argc, argv); + + r = run(argc, argv); + + mac_selinux_finish(); + static_destruct(); + + return r < 0 ? EXIT_FAILURE : r; +} diff --git a/src/core/fuzz-execute-serialize.c b/src/core/fuzz-execute-serialize.c new file mode 100644 index 0000000..6069efd --- /dev/null +++ b/src/core/fuzz-execute-serialize.c @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* Notes on how to run the fuzzer manually: + * 1) Build the fuzzers with LLVM's libFuzzer and ASan+UBSan: + * $ CC=clang CXX=clang++ meson build-libfuzz -Db_sanitize=address,undefined -Dllvm-fuzz=true -Db_lundef=false + * + * 2) Collect some valid inputs: + * + * OUT=test/fuzz/fuzz-execute-serialize/initial + * for section in context command parameters runtime cgroup; do + * awk "match(\$0, /startswith\\(.+, \"(exec-${section}-[^\"]+=)\"/, m) { print m[1]; }" \ + * src/core/execute-serialize.c >>"$OUT" + * # Each "section" is delimited by an empty line + * echo >>"$OUT" + * done + * + * 3) Run the fuzzer: + * $ build-libfuzz/fuzz-execute-serialize test/fuzz/fuzz-execute-serialize + */ + +#include + +#include "alloc-util.h" +#include "execute-serialize.h" +#include "fd-util.h" +#include "fuzz.h" +#include "service.h" + +static void exec_fuzz_one(FILE *f, FDSet *fdset) { + _cleanup_(exec_params_deep_clear) ExecParameters params = EXEC_PARAMETERS_INIT(/* flags= */ 0); + _cleanup_(exec_context_done) ExecContext exec_context = {}; + _cleanup_(cgroup_context_done) CGroupContext cgroup_context = {}; + DynamicCreds dynamic_creds = {}; + ExecCommand command = {}; + ExecSharedRuntime shared = { + .netns_storage_socket = EBADF_PAIR, + .ipcns_storage_socket = EBADF_PAIR, + }; + ExecRuntime runtime = { + .ephemeral_storage_socket = EBADF_PAIR, + .shared = &shared, + .dynamic_creds = &dynamic_creds, + }; + + exec_context_init(&exec_context); + cgroup_context_init(&cgroup_context); + + (void) exec_deserialize_invocation(f, fdset, &exec_context, &command, ¶ms, &runtime, &cgroup_context); + (void) exec_serialize_invocation(f, fdset, &exec_context, &command, ¶ms, &runtime, &cgroup_context); + (void) exec_deserialize_invocation(f, fdset, &exec_context, &command, ¶ms, &runtime, &cgroup_context); + + /* We definitely didn't provide valid FDs during deserialization, so + * wipe the FDs before exec_params_serialized_clear() kicks in, otherwise + * we'll hit the assert in safe_close() */ + params.stdin_fd = -EBADF; + params.stdout_fd = -EBADF; + params.stderr_fd = -EBADF; + params.exec_fd = -EBADF; + params.user_lookup_fd = -EBADF; + params.bpf_outer_map_fd = -EBADF; + if (!params.fds) + params.n_socket_fds = params.n_storage_fds = 0; + for (size_t i = 0; params.fds && i < params.n_socket_fds + params.n_storage_fds; i++) + params.fds[i] = -EBADF; + + exec_command_done_array(&command, /* n= */ 1); + exec_shared_runtime_done(&shared); + if (dynamic_creds.group != dynamic_creds.user) + dynamic_user_free(dynamic_creds.group); + dynamic_user_free(dynamic_creds.user); + free(runtime.ephemeral_copy); + safe_close_pair(runtime.ephemeral_storage_socket); +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_fdset_free_ FDSet *fdset = NULL; + + if (outside_size_range(size, 0, 128 * 1024)) + return 0; + + fuzz_setup_logging(); + + assert_se(fdset = fdset_new()); + assert_se(f = data_to_file(data, size)); + + exec_fuzz_one(f, fdset); + + return 0; +} diff --git a/src/core/fuzz-manager-serialize.c b/src/core/fuzz-manager-serialize.c new file mode 100644 index 0000000..57083ca --- /dev/null +++ b/src/core/fuzz-manager-serialize.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fuzz.h" +#include "manager-serialize.h" +#include "manager.h" +#include "service.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_fclose_ FILE *f = NULL, *null = NULL; + _cleanup_fdset_free_ FDSet *fdset = NULL; + + if (outside_size_range(size, 0, 65536)) + return 0; + + fuzz_setup_logging(); + + assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &m) >= 0); + /* Set log overrides as well to make it harder for a serialization file + * to switch log levels/targets during fuzzing */ + manager_override_log_level(m, log_get_max_level()); + manager_override_log_target(m, log_get_target()); + assert_se(null = fopen("/dev/null", "we")); + assert_se(fdset = fdset_new()); + assert_se(f = data_to_file(data, size)); + + (void) manager_deserialize(m, f, fdset); + (void) manager_serialize(m, null, fdset, true); + (void) manager_serialize(m, null, fdset, false); + + return 0; +} diff --git a/src/core/fuzz-manager-serialize.options b/src/core/fuzz-manager-serialize.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/core/fuzz-manager-serialize.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/core/fuzz-unit-file.c b/src/core/fuzz-unit-file.c new file mode 100644 index 0000000..57480cf --- /dev/null +++ b/src/core/fuzz-unit-file.c @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "conf-parser.h" +#include "fd-util.h" +#include "fuzz.h" +#include "install.h" +#include "load-fragment.h" +#include "manager-dump.h" +#include "memstream-util.h" +#include "string-util.h" +#include "unit-serialize.h" +#include "utf8.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + UnitType t; + _cleanup_(manager_freep) Manager *m = NULL; + Unit *u; + const char *name; + long offset; + + if (outside_size_range(size, 0, 65536)) + return 0; + + f = data_to_file(data, size); + + assert_se(f); + + if (read_line(f, LINE_MAX, &p) < 0) + return 0; + + t = unit_type_from_string(p); + if (t < 0) + return 0; + + if (!unit_vtable[t]->load) + return 0; + + offset = ftell(f); + assert_se(offset >= 0); + + for (;;) { + _cleanup_free_ char *l = NULL; + const char *ll; + + if (read_line(f, LONG_LINE_MAX, &l) <= 0) + break; + + ll = startswith(l, UTF8_BYTE_ORDER_MARK) ?: l; + ll = ll + strspn(ll, WHITESPACE); + + if (HAS_FEATURE_MEMORY_SANITIZER && startswith(ll, "ListenNetlink")) { + /* ListenNetlink causes a false positive in msan, + * let's skip this for now. */ + log_notice("Skipping test because ListenNetlink= is present"); + return 0; + } + } + + assert_se(fseek(f, offset, SEEK_SET) == 0); + + fuzz_setup_logging(); + + assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &m) >= 0); + + name = strjoina("a.", unit_type_to_string(t)); + assert_se(unit_new_for_name(m, unit_vtable[t]->object_size, name, &u) >= 0); + + (void) config_parse( + name, name, f, + UNIT_VTABLE(u)->sections, + config_item_perf_lookup, load_fragment_gperf_lookup, + 0, + u, + NULL); + + _cleanup_(memstream_done) MemStream ms = {}; + FILE *g; + + assert_se(g = memstream_init(&ms)); + unit_dump(u, g, ""); + manager_dump(m, g, /* patterns= */ NULL, ">>>"); + + return 0; +} diff --git a/src/core/fuzz-unit-file.options b/src/core/fuzz-unit-file.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/core/fuzz-unit-file.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/core/generator-setup.c b/src/core/generator-setup.c new file mode 100644 index 0000000..00d6ad6 --- /dev/null +++ b/src/core/generator-setup.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "generator-setup.h" +#include "macro.h" +#include "mkdir-label.h" +#include "rm-rf.h" + +int lookup_paths_mkdir_generator(LookupPaths *p) { + int r, q; + + assert(p); + + if (!p->generator || !p->generator_early || !p->generator_late) + return -EINVAL; + + r = mkdir_p_label(p->generator, 0755); + + q = mkdir_p_label(p->generator_early, 0755); + if (q < 0 && r >= 0) + r = q; + + q = mkdir_p_label(p->generator_late, 0755); + if (q < 0 && r >= 0) + r = q; + + return r; +} + +void lookup_paths_trim_generator(LookupPaths *p) { + assert(p); + + /* Trim empty dirs */ + + if (p->generator) + (void) rmdir(p->generator); + if (p->generator_early) + (void) rmdir(p->generator_early); + if (p->generator_late) + (void) rmdir(p->generator_late); +} + +void lookup_paths_flush_generator(LookupPaths *p) { + assert(p); + + /* Flush the generated unit files in full */ + + if (p->generator) + (void) rm_rf(p->generator, REMOVE_ROOT|REMOVE_PHYSICAL); + if (p->generator_early) + (void) rm_rf(p->generator_early, REMOVE_ROOT|REMOVE_PHYSICAL); + if (p->generator_late) + (void) rm_rf(p->generator_late, REMOVE_ROOT|REMOVE_PHYSICAL); + + if (p->temporary_dir) + (void) rm_rf(p->temporary_dir, REMOVE_ROOT|REMOVE_PHYSICAL); +} diff --git a/src/core/generator-setup.h b/src/core/generator-setup.h new file mode 100644 index 0000000..1cc816b --- /dev/null +++ b/src/core/generator-setup.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "path-lookup.h" + +int lookup_paths_mkdir_generator(LookupPaths *p); +void lookup_paths_trim_generator(LookupPaths *p); +void lookup_paths_flush_generator(LookupPaths *p); diff --git a/src/core/ima-setup.c b/src/core/ima-setup.c new file mode 100644 index 0000000..37916bb --- /dev/null +++ b/src/core/ima-setup.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy + TORSEC group — http://security.polito.it +***/ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "ima-setup.h" +#include "log.h" + +#define IMA_SECFS_DIR "/sys/kernel/security/ima" +#define IMA_SECFS_POLICY IMA_SECFS_DIR "/policy" +#define IMA_POLICY_PATH "/etc/ima/ima-policy" + +int ima_setup(void) { +#if ENABLE_IMA + _cleanup_fclose_ FILE *input = NULL; + _cleanup_close_ int imafd = -EBADF; + unsigned lineno = 0; + int r; + + if (access(IMA_SECFS_DIR, F_OK) < 0) { + log_debug_errno(errno, "IMA support is disabled in the kernel, ignoring: %m"); + return 0; + } + + if (access(IMA_SECFS_POLICY, W_OK) < 0) { + log_warning_errno(errno, "Another IMA custom policy has already been loaded, ignoring: %m"); + return 0; + } + + if (access(IMA_POLICY_PATH, F_OK) < 0) { + log_debug_errno(errno, "No IMA custom policy file "IMA_POLICY_PATH", ignoring: %m"); + return 0; + } + + imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC); + if (imafd < 0) { + log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m"); + return 0; + } + + /* attempt to write the name of the policy file into sysfs file */ + if (write(imafd, IMA_POLICY_PATH, STRLEN(IMA_POLICY_PATH)) > 0) + goto done; + + /* fall back to copying the policy line-by-line */ + input = fopen(IMA_POLICY_PATH, "re"); + if (!input) { + log_warning_errno(errno, "Failed to open the IMA custom policy file "IMA_POLICY_PATH", ignoring: %m"); + return 0; + } + + safe_close(imafd); + + imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC); + if (imafd < 0) { + log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m"); + return 0; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + size_t len; + + r = read_line(input, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read the IMA custom policy file "IMA_POLICY_PATH": %m"); + if (r == 0) + break; + + len = strlen(line); + lineno++; + + if (len > 0 && write(imafd, line, len) < 0) + return log_error_errno(errno, "Failed to load the IMA custom policy file "IMA_POLICY_PATH"%u: %m", + lineno); + } + +done: + log_info("Successfully loaded the IMA custom policy "IMA_POLICY_PATH"."); +#endif /* ENABLE_IMA */ + return 0; +} diff --git a/src/core/ima-setup.h b/src/core/ima-setup.h new file mode 100644 index 0000000..f964c7b --- /dev/null +++ b/src/core/ima-setup.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy + TORSEC group — http://security.polito.it +***/ + +int ima_setup(void); diff --git a/src/core/import-creds.c b/src/core/import-creds.c new file mode 100644 index 0000000..48f3160 --- /dev/null +++ b/src/core/import-creds.c @@ -0,0 +1,938 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "confidential-virt.h" +#include "copy.h" +#include "creds-util.h" +#include "escape.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "import-creds.h" +#include "initrd-util.h" +#include "io-util.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "recurse-dir.h" +#include "strv.h" +#include "virt.h" + +/* This imports credentials passed in from environments higher up (VM manager, boot loader, …) and rearranges + * them so that later code can access them using our regular credential protocol + * (i.e. $CREDENTIALS_DIRECTORY). It's supposed to be minimal glue to unify behaviour how PID 1 (and + * generators invoked by it) can acquire credentials from outside, to mimic how we support it for containers, + * but on VM/physical environments. + * + * This does four things: + * + * 1. It imports credentials picked up by sd-boot (and placed in the /.extra/credentials/ dir in the initrd) + * and puts them in /run/credentials/@encrypted/. Note that during the initrd→host transition the initrd root + * file system is cleaned out, thus it is essential we pick up these files before they are deleted. Note + * that these credentials originate from an untrusted source, i.e. the ESP and are not + * pre-authenticated. They still have to be authenticated before use. + * + * 2. It imports credentials from /proc/cmdline and puts them in /run/credentials/@system/. These come from a + * trusted environment (i.e. the boot loader), and are typically authenticated (if authentication is done + * at all). However, they are world-readable, which might be less than ideal. Hence only use this for data + * that doesn't require trust. + * + * 3. It imports credentials passed in through qemu's fw_cfg logic. Specifically, credential data passed in + * /sys/firmware/qemu_fw_cfg/by_name/opt/io.systemd.credentials/ is picked up and also placed in + * /run/credentials/@system/. + * + * 4. It imports credentials passed in via the DMI/SMBIOS OEM string tables, quite similar to fw_cfg. It + * looks for strings starting with "io.systemd.credential:" and "io.systemd.credential.binary:". Both + * expect a key=value assignment, but in the latter case the value is Base64 decoded, allowing binary + * credentials to be passed in. + * + * If it picked up any credentials it will set the $CREDENTIALS_DIRECTORY and + * $ENCRYPTED_CREDENTIALS_DIRECTORY environment variables to point to these directories, so that processes + * can find them there later on. If "ramfs" is available $CREDENTIALS_DIRECTORY will be backed by it (but + * $ENCRYPTED_CREDENTIALS_DIRECTORY is just a regular tmpfs). + * + * Net result: the service manager can pick up trusted credentials from $CREDENTIALS_DIRECTORY afterwards, + * and untrusted ones from $ENCRYPTED_CREDENTIALS_DIRECTORY. */ + +typedef struct ImportCredentialContext { + int target_dir_fd; + size_t size_sum; + unsigned n_credentials; +} ImportCredentialContext; + +static void import_credentials_context_free(ImportCredentialContext *c) { + assert(c); + + c->target_dir_fd = safe_close(c->target_dir_fd); +} + +static int acquire_credential_directory(ImportCredentialContext *c, const char *path, bool with_mount) { + int r; + + assert(c); + assert(path); + + if (c->target_dir_fd >= 0) + return c->target_dir_fd; + + r = path_is_mount_point(path, NULL, 0); + if (r < 0) { + if (r != -ENOENT) + return log_error_errno(r, "Failed to determine if %s is a mount point: %m", path); + + r = mkdir_safe_label(path, 0700, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + return log_error_errno(r, "Failed to create %s mount point: %m", path); + + r = 0; /* Now it exists and is not a mount point */ + } + if (r > 0) + /* If already a mount point, then remount writable */ + (void) mount_nofollow_verbose(LOG_WARNING, NULL, path, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL); + else if (with_mount) + /* If not a mount point yet, and the credentials are not encrypted, then let's try to mount a no-swap fs there */ + (void) mount_credentials_fs(path, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false); + + c->target_dir_fd = open(path, O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (c->target_dir_fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", path); + + return c->target_dir_fd; +} + +static int open_credential_file_for_write(int target_dir_fd, const char *dir_name, const char *n) { + int fd; + + assert(target_dir_fd >= 0); + assert(dir_name); + assert(n); + + fd = openat(target_dir_fd, n, O_WRONLY|O_CLOEXEC|O_CREAT|O_EXCL|O_NOFOLLOW, 0400); + if (fd < 0) { + if (errno == EEXIST) /* In case of EEXIST we'll only debug log! */ + return log_debug_errno(errno, "Credential '%s' set twice, ignoring.", n); + + return log_error_errno(errno, "Failed to create %s/%s: %m", dir_name, n); + } + + return fd; +} + +static bool credential_size_ok(ImportCredentialContext *c, const char *name, uint64_t size) { + assert(c); + assert(name); + + if (size > CREDENTIAL_SIZE_MAX) { + log_warning("Credential '%s' is larger than allowed limit (%s > %s), skipping.", name, FORMAT_BYTES(size), FORMAT_BYTES(CREDENTIAL_SIZE_MAX)); + return false; + } + + if (size > CREDENTIALS_TOTAL_SIZE_MAX - c->size_sum) { + log_warning("Accumulated credential size would be above allowed limit (%s+%s > %s), skipping '%s'.", + FORMAT_BYTES(c->size_sum), FORMAT_BYTES(size), FORMAT_BYTES(CREDENTIALS_TOTAL_SIZE_MAX), name); + return false; + } + + return true; +} + +static int finalize_credentials_dir(const char *dir, const char *envvar) { + int r; + + assert(dir); + assert(envvar); + + /* Try to make the credentials directory read-only now */ + + r = make_mount_point(dir); + if (r < 0) + log_warning_errno(r, "Failed to make '%s' a mount point, ignoring: %m", dir); + else + (void) mount_nofollow_verbose(LOG_WARNING, NULL, dir, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL); + + if (setenv(envvar, dir, /* overwrite= */ true) < 0) + return log_error_errno(errno, "Failed to set $%s environment variable: %m", envvar); + + return 0; +} + +static int import_credentials_boot(void) { + _cleanup_(import_credentials_context_free) ImportCredentialContext context = { + .target_dir_fd = -EBADF, + }; + int r; + + /* systemd-stub will wrap sidecar *.cred files from the UEFI kernel image directory into initrd + * cpios, so that they unpack into /.extra/. We'll pick them up from there and copy them into /run/ + * so that we can access them during the entire runtime (note that the initrd file system is erased + * during the initrd → host transition). Note that these credentials originate from an untrusted + * source (i.e. the ESP typically) and thus need to be authenticated later. We thus put them in a + * directory separate from the usual credentials which are from a trusted source. */ + + if (!in_initrd()) + return 0; + + FOREACH_STRING(p, + "/.extra/credentials/", /* specific to this boot menu */ + "/.extra/global_credentials/") { /* boot partition wide */ + + _cleanup_free_ DirectoryEntries *de = NULL; + _cleanup_close_ int source_dir_fd = -EBADF; + + source_dir_fd = open(p, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW); + if (source_dir_fd < 0) { + if (errno == ENOENT) { + log_debug("No credentials passed via %s.", p); + continue; + } + + log_warning_errno(errno, "Failed to open '%s', ignoring: %m", p); + continue; + } + + r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) { + log_warning_errno(r, "Failed to read '%s' contents, ignoring: %m", p); + continue; + } + + for (size_t i = 0; i < de->n_entries; i++) { + const struct dirent *d = de->entries[i]; + _cleanup_close_ int cfd = -EBADF, nfd = -EBADF; + _cleanup_free_ char *n = NULL; + const char *e; + struct stat st; + + e = endswith(d->d_name, ".cred"); + if (!e) + continue; + + /* drop .cred suffix (which we want in the ESP sidecar dir, but not for our internal + * processing) */ + n = strndup(d->d_name, e - d->d_name); + if (!n) + return log_oom(); + + if (!credential_name_valid(n)) { + log_warning("Credential '%s' has invalid name, ignoring.", d->d_name); + continue; + } + + cfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_CLOEXEC); + if (cfd < 0) { + log_warning_errno(errno, "Failed to open %s, ignoring: %m", d->d_name); + continue; + } + + if (fstat(cfd, &st) < 0) { + log_warning_errno(errno, "Failed to stat %s, ignoring: %m", d->d_name); + continue; + } + + r = stat_verify_regular(&st); + if (r < 0) { + log_warning_errno(r, "Credential file %s is not a regular file, ignoring: %m", d->d_name); + continue; + } + + if (!credential_size_ok(&context, n, st.st_size)) + continue; + + r = acquire_credential_directory(&context, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ false); + if (r < 0) + return r; + + nfd = open_credential_file_for_write(context.target_dir_fd, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, n); + if (nfd == -EEXIST) + continue; + if (nfd < 0) + return nfd; + + r = copy_bytes(cfd, nfd, st.st_size, 0); + if (r < 0) { + (void) unlinkat(context.target_dir_fd, n, 0); + return log_error_errno(r, "Failed to create credential '%s': %m", n); + } + + context.size_sum += st.st_size; + context.n_credentials++; + + log_debug("Successfully copied boot credential '%s'.", n); + } + } + + if (context.n_credentials > 0) { + log_debug("Imported %u credentials from boot loader.", context.n_credentials); + + r = finalize_credentials_dir(ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, "ENCRYPTED_CREDENTIALS_DIRECTORY"); + if (r < 0) + return r; + } + + return 0; +} + +static int proc_cmdline_callback(const char *key, const char *value, void *data) { + ImportCredentialContext *c = ASSERT_PTR(data); + _cleanup_free_ void *binary = NULL; + _cleanup_free_ char *n = NULL; + _cleanup_close_ int nfd = -EBADF; + const char *colon, *d; + bool base64; + size_t l; + int r; + + assert(key); + + if (proc_cmdline_key_streq(key, "systemd.set_credential")) + base64 = false; + else if (proc_cmdline_key_streq(key, "systemd.set_credential_binary")) + base64 = true; + else + return 0; + + colon = value ? strchr(value, ':') : NULL; + if (!colon) { + log_warning("Credential assignment through kernel command line lacks ':' character, ignoring: %s", value); + return 0; + } + + n = strndup(value, colon - value); + if (!n) + return log_oom(); + + if (!credential_name_valid(n)) { + log_warning("Credential name '%s' is invalid, ignoring.", n); + return 0; + } + + colon++; + + if (base64) { + r = unbase64mem(colon, SIZE_MAX, &binary, &l); + if (r < 0) { + log_warning_errno(r, "Failed to decode binary credential '%s' data, ignoring: %m", n); + return 0; + } + + d = binary; + } else { + d = colon; + l = strlen(colon); + } + + if (!credential_size_ok(c, n, l)) + return 0; + + r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true); + if (r < 0) + return r; + + nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, n); + if (nfd == -EEXIST) + return 0; + if (nfd < 0) + return nfd; + + r = loop_write(nfd, d, l); + if (r < 0) { + (void) unlinkat(c->target_dir_fd, n, 0); + return log_error_errno(r, "Failed to write credential: %m"); + } + + c->size_sum += l; + c->n_credentials++; + + log_debug("Successfully processed kernel command line credential '%s'.", n); + + return 0; +} + +static int import_credentials_proc_cmdline(ImportCredentialContext *c) { + int r; + + assert(c); + + r = proc_cmdline_parse(proc_cmdline_callback, c, 0); + if (r < 0) + return log_error_errno(r, "Failed to parse /proc/cmdline: %m"); + + return 0; +} + +#define QEMU_FWCFG_PATH "/sys/firmware/qemu_fw_cfg/by_name/opt/io.systemd.credentials" + +static int import_credentials_qemu(ImportCredentialContext *c) { + _cleanup_free_ DirectoryEntries *de = NULL; + _cleanup_close_ int source_dir_fd = -EBADF; + int r; + + assert(c); + + if (detect_container() > 0) /* don't access /sys/ in a container */ + return 0; + + if (detect_confidential_virtualization() > 0) /* don't trust firmware if confidential VMs */ + return 0; + + source_dir_fd = open(QEMU_FWCFG_PATH, O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (source_dir_fd < 0) { + if (errno == ENOENT) { + log_debug("No credentials passed via fw_cfg."); + return 0; + } + + log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "', ignoring: %m"); + return 0; + } + + r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) { + log_warning_errno(r, "Failed to read '" QEMU_FWCFG_PATH "' contents, ignoring: %m"); + return 0; + } + + for (size_t i = 0; i < de->n_entries; i++) { + const struct dirent *d = de->entries[i]; + _cleanup_close_ int vfd = -EBADF, rfd = -EBADF, nfd = -EBADF; + _cleanup_free_ char *szs = NULL; + uint64_t sz; + + if (!credential_name_valid(d->d_name)) { + log_warning("Credential '%s' has invalid name, ignoring.", d->d_name); + continue; + } + + vfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (vfd < 0) { + log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "'/%s/, ignoring: %m", d->d_name); + continue; + } + + r = read_virtual_file_at(vfd, "size", LINE_MAX, &szs, NULL); + if (r < 0) { + log_warning_errno(r, "Failed to read '" QEMU_FWCFG_PATH "'/%s/size, ignoring: %m", d->d_name); + continue; + } + + r = safe_atou64(strstrip(szs), &sz); + if (r < 0) { + log_warning_errno(r, "Failed to parse size of credential '%s', ignoring: %s", d->d_name, szs); + continue; + } + + if (!credential_size_ok(c, d->d_name, sz)) + continue; + + /* Ideally we'd just symlink the data here. Alas the kernel driver exports the raw file as + * having size zero, and we'd rather not have applications support such credential + * files. Let's hence copy the files to make them regular. */ + + rfd = openat(vfd, "raw", O_RDONLY|O_CLOEXEC); + if (rfd < 0) { + log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "'/%s/raw, ignoring: %m", d->d_name); + continue; + } + + r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true); + if (r < 0) + return r; + + nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, d->d_name); + if (nfd == -EEXIST) + continue; + if (nfd < 0) + return nfd; + + r = copy_bytes(rfd, nfd, sz, 0); + if (r < 0) { + (void) unlinkat(c->target_dir_fd, d->d_name, 0); + return log_error_errno(r, "Failed to create credential '%s': %m", d->d_name); + } + + c->size_sum += sz; + c->n_credentials++; + + log_debug("Successfully copied qemu fw_cfg credential '%s'.", d->d_name); + } + + return 0; +} + +static int parse_smbios_strings(ImportCredentialContext *c, const char *data, size_t size) { + size_t left, skip; + const char *p; + int r; + + assert(c); + assert(data || size == 0); + + /* Unpacks a packed series of SMBIOS OEM vendor strings. These are a series of NUL terminated + * strings, one after the other. */ + + for (p = data, left = size; left > 0; p += skip, left -= skip) { + _cleanup_free_ void *buf = NULL; + _cleanup_free_ char *cn = NULL; + _cleanup_close_ int nfd = -EBADF; + const char *nul, *n, *eq; + const void *cdata; + size_t buflen, cdata_len; + bool unbase64; + + nul = memchr(p, 0, left); + if (nul) + skip = (nul - p) + 1; + else { + nul = p + left; + skip = left; + } + + if (nul - p == 0) /* Skip empty strings */ + continue; + + /* Only care about strings starting with either of these two prefixes */ + if ((n = memory_startswith(p, nul - p, "io.systemd.credential:"))) + unbase64 = false; + else if ((n = memory_startswith(p, nul - p, "io.systemd.credential.binary:"))) + unbase64 = true; + else { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape_length(p, nul - p); + log_debug("Ignoring OEM string: %s", strnull(escaped)); + continue; + } + + eq = memchr(n, '=', nul - n); + if (!eq) { + log_warning("SMBIOS OEM string lacks '=' character, ignoring."); + continue; + } + + cn = memdup_suffix0(n, eq - n); + if (!cn) + return log_oom(); + + if (!credential_name_valid(cn)) { + log_warning("SMBIOS credential name '%s' is not valid, ignoring: %m", cn); + continue; + } + + /* Optionally base64 decode the data, if requested, to allow binary credentials */ + if (unbase64) { + r = unbase64mem(eq + 1, nul - (eq + 1), &buf, &buflen); + if (r < 0) { + log_warning_errno(r, "Failed to base64 decode credential '%s', ignoring: %m", cn); + continue; + } + + cdata = buf; + cdata_len = buflen; + } else { + cdata = eq + 1; + cdata_len = nul - (eq + 1); + } + + if (!credential_size_ok(c, cn, cdata_len)) + continue; + + r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true); + if (r < 0) + return r; + + nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, cn); + if (nfd == -EEXIST) + continue; + if (nfd < 0) + return nfd; + + r = loop_write(nfd, cdata, cdata_len); + if (r < 0) { + (void) unlinkat(c->target_dir_fd, cn, 0); + return log_error_errno(r, "Failed to write credential: %m"); + } + + c->size_sum += cdata_len; + c->n_credentials++; + + log_debug("Successfully processed SMBIOS credential '%s'.", cn); + } + + return 0; +} + +static int import_credentials_smbios(ImportCredentialContext *c) { + int r; + + /* Parses DMI OEM strings fields (SMBIOS type 11), as settable with qemu's -smbios type=11,value=… switch. */ + + if (detect_container() > 0) /* don't access /sys/ in a container */ + return 0; + + if (detect_confidential_virtualization() > 0) /* don't trust firmware if confidential VMs */ + return 0; + + for (unsigned i = 0;; i++) { + struct dmi_field_header { + uint8_t type; + uint8_t length; + uint16_t handle; + uint8_t count; + char contents[]; + } _packed_ *dmi_field_header; + _cleanup_free_ char *p = NULL; + _cleanup_free_ void *data = NULL; + size_t size; + + assert_cc(offsetof(struct dmi_field_header, contents) == 5); + + if (asprintf(&p, "/sys/firmware/dmi/entries/11-%u/raw", i) < 0) + return log_oom(); + + r = read_virtual_file(p, sizeof(dmi_field_header) + CREDENTIALS_TOTAL_SIZE_MAX, (char**) &data, &size); + if (r < 0) { + /* Once we reach ENOENT there are no more DMI Type 11 fields around. */ + log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to open '%s', ignoring: %m", p); + break; + } + + if (size < offsetof(struct dmi_field_header, contents)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "DMI field header of '%s' too short.", p); + + dmi_field_header = data; + if (dmi_field_header->type != 11 || + dmi_field_header->length != offsetof(struct dmi_field_header, contents)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Invalid DMI field header."); + + r = parse_smbios_strings(c, dmi_field_header->contents, size - offsetof(struct dmi_field_header, contents)); + if (r < 0) + return r; + + if (i == UINT_MAX) /* Prevent overflow */ + break; + } + + return 0; +} + +static int import_credentials_initrd(ImportCredentialContext *c) { + _cleanup_free_ DirectoryEntries *de = NULL; + _cleanup_close_ int source_dir_fd = -EBADF; + int r; + + assert(c); + + /* This imports credentials from /run/credentials/@initrd/ into our credentials directory and deletes + * the source directory afterwards. This is run once after the initrd → host transition. This is + * supposed to establish a well-defined avenue for initrd-based host configurators to pass + * credentials into the main system. */ + + if (in_initrd()) + return 0; + + source_dir_fd = open("/run/credentials/@initrd", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW); + if (source_dir_fd < 0) { + if (errno == ENOENT) + log_debug_errno(errno, "No credentials passed from initrd."); + else + log_warning_errno(errno, "Failed to open '/run/credentials/@initrd', ignoring: %m"); + return 0; + } + + r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) { + log_warning_errno(r, "Failed to read '/run/credentials/@initrd' contents, ignoring: %m"); + return 0; + } + + FOREACH_ARRAY(entry, de->entries, de->n_entries) { + _cleanup_close_ int cfd = -EBADF, nfd = -EBADF; + const struct dirent *d = *entry; + struct stat st; + + if (!credential_name_valid(d->d_name)) { + log_warning("Credential '%s' has invalid name, ignoring.", d->d_name); + continue; + } + + cfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_CLOEXEC); + if (cfd < 0) { + log_warning_errno(errno, "Failed to open %s, ignoring: %m", d->d_name); + continue; + } + + if (fstat(cfd, &st) < 0) { + log_warning_errno(errno, "Failed to stat %s, ignoring: %m", d->d_name); + continue; + } + + r = stat_verify_regular(&st); + if (r < 0) { + log_warning_errno(r, "Credential file %s is not a regular file, ignoring: %m", d->d_name); + continue; + } + + if (!credential_size_ok(c, d->d_name, st.st_size)) + continue; + + r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true); + if (r < 0) + return r; + + nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, d->d_name); + if (nfd == -EEXIST) + continue; + if (nfd < 0) + return nfd; + + r = copy_bytes(cfd, nfd, st.st_size, 0); + if (r < 0) { + (void) unlinkat(c->target_dir_fd, d->d_name, 0); + return log_error_errno(r, "Failed to create credential '%s': %m", d->d_name); + } + + c->size_sum += st.st_size; + c->n_credentials++; + + log_debug("Successfully copied initrd credential '%s'.", d->d_name); + + (void) unlinkat(source_dir_fd, d->d_name, 0); + } + + source_dir_fd = safe_close(source_dir_fd); + + if (rmdir("/run/credentials/@initrd") < 0) + log_warning_errno(errno, "Failed to remove /run/credentials/@initrd after import, ignoring: %m"); + + return 0; +} + +static int import_credentials_trusted(void) { + _cleanup_(import_credentials_context_free) ImportCredentialContext c = { + .target_dir_fd = -EBADF, + }; + int q, w, r, y; + + /* This is invoked during early boot when no credentials have been imported so far. (Specifically, if + * the $CREDENTIALS_DIRECTORY or $ENCRYPTED_CREDENTIALS_DIRECTORY environment variables are not set + * yet.) */ + + r = import_credentials_qemu(&c); + w = import_credentials_smbios(&c); + q = import_credentials_proc_cmdline(&c); + y = import_credentials_initrd(&c); + + if (c.n_credentials > 0) { + int z; + + log_debug("Imported %u credentials from kernel command line/smbios/fw_cfg/initrd.", c.n_credentials); + + z = finalize_credentials_dir(SYSTEM_CREDENTIALS_DIRECTORY, "CREDENTIALS_DIRECTORY"); + if (z < 0) + return z; + } + + return r < 0 ? r : w < 0 ? w : q < 0 ? q : y; +} + +static int merge_credentials_trusted(const char *creds_dir) { + _cleanup_(import_credentials_context_free) ImportCredentialContext c = { + .target_dir_fd = -EBADF, + }; + int r; + + /* This is invoked after the initrd → host transitions, when credentials already have been imported, + * but we might want to import some more from the initrd. */ + + if (in_initrd()) + return 0; + + /* Do not try to merge initrd credentials into foreign credentials directories */ + if (!path_equal_ptr(creds_dir, SYSTEM_CREDENTIALS_DIRECTORY)) { + log_debug("Not importing initrd credentials, as foreign $CREDENTIALS_DIRECTORY has been set."); + return 0; + } + + r = import_credentials_initrd(&c); + + if (c.n_credentials > 0) { + int z; + + log_debug("Merged %u credentials from initrd.", c.n_credentials); + + z = finalize_credentials_dir(SYSTEM_CREDENTIALS_DIRECTORY, "CREDENTIALS_DIRECTORY"); + if (z < 0) + return z; + } + + return r; +} + +static int symlink_credential_dir(const char *envvar, const char *path, const char *where) { + int r; + + assert(envvar); + assert(path); + assert(where); + + if (!path_is_valid(path) || !path_is_absolute(path)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "String specified via $%s is not a valid absolute path, refusing: %s", envvar, path); + + /* If the env var already points to where we intend to create the symlink, then most likely we + * already imported some creds earlier, and thus set the env var, and hence don't need to do + * anything. */ + if (path_equal(path, where)) + return 0; + + r = symlink_idempotent(path, where, /* make_relative= */ true); + if (r < 0) + return log_error_errno(r, "Failed to link $%s to %s: %m", envvar, where); + + return 0; +} + +static int setenv_notify_socket(void) { + _cleanup_free_ char *address = NULL; + int r; + + r = read_credential_with_decryption("vmm.notify_socket", (void **)&address, /* ret_size= */ NULL); + if (r < 0) + return log_warning_errno(r, "Failed to read 'vmm.notify_socket' credential, ignoring: %m"); + + if (isempty(address)) + return 0; + + if (setenv("NOTIFY_SOCKET", address, /* replace= */ 1) < 0) + return log_warning_errno(errno, "Failed to set $NOTIFY_SOCKET environment variable, ignoring: %m"); + + return 1; +} + +static int report_credentials_per_func(const char *title, int (*get_directory_func)(const char **ret)) { + _cleanup_free_ DirectoryEntries *de = NULL; + _cleanup_close_ int dir_fd = -EBADF; + _cleanup_free_ char *ll = NULL; + const char *d = NULL; + int r, c = 0; + + assert(title); + assert(get_directory_func); + + r = get_directory_func(&d); + if (r < 0) { + if (r == -ENXIO) /* Env var not set */ + return 0; + + return log_warning_errno(r, "Failed to determine %s directory: %m", title); + } + + dir_fd = open(d, O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (dir_fd < 0) + return log_warning_errno(errno, "Failed to open credentials directory %s: %m", d); + + r = readdir_all(dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) + return log_warning_errno(r, "Failed to enumerate credentials directory %s: %m", d); + + FOREACH_ARRAY(entry, de->entries, de->n_entries) { + const struct dirent *e = *entry; + + if (!credential_name_valid(e->d_name)) + continue; + + if (!strextend_with_separator(&ll, ", ", e->d_name)) + return log_oom(); + + c++; + } + + if (ll) + log_info("Received %s: %s", title, ll); + + return c; +} + +static void report_credentials(void) { + int p, q; + + p = report_credentials_per_func("regular credentials", get_credentials_dir); + q = report_credentials_per_func("untrusted credentials", get_encrypted_credentials_dir); + + log_full(p > 0 || q > 0 ? LOG_INFO : LOG_DEBUG, + "Acquired %i regular credentials, %i untrusted credentials.", + p > 0 ? p : 0, + q > 0 ? q : 0); +} + +int import_credentials(void) { + const char *received_creds_dir = NULL, *received_encrypted_creds_dir = NULL; + bool envvar_set = false; + int r, q; + + r = get_credentials_dir(&received_creds_dir); + if (r < 0 && r != -ENXIO) /* ENXIO → env var not set yet */ + log_warning_errno(r, "Failed to determine credentials directory, ignoring: %m"); + + envvar_set = r >= 0; + + r = get_encrypted_credentials_dir(&received_encrypted_creds_dir); + if (r < 0 && r != -ENXIO) /* ENXIO → env var not set yet */ + log_warning_errno(r, "Failed to determine encrypted credentials directory, ignoring: %m"); + + envvar_set = envvar_set || r >= 0; + + if (envvar_set) { + /* Maybe an earlier stage initrd already set this up? If so, don't try to import anything again. */ + log_debug("Not importing credentials, $CREDENTIALS_DIRECTORY or $ENCRYPTED_CREDENTIALS_DIRECTORY already set."); + + /* But, let's make sure the creds are available from our regular paths. */ + if (received_creds_dir) + r = symlink_credential_dir("CREDENTIALS_DIRECTORY", received_creds_dir, SYSTEM_CREDENTIALS_DIRECTORY); + else + r = 0; + + if (received_encrypted_creds_dir) { + q = symlink_credential_dir("ENCRYPTED_CREDENTIALS_DIRECTORY", received_encrypted_creds_dir, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY); + if (r >= 0) + r = q; + } + + q = merge_credentials_trusted(received_creds_dir); + if (r >= 0) + r = q; + + } else { + _cleanup_free_ char *v = NULL; + + r = proc_cmdline_get_key("systemd.import_credentials", PROC_CMDLINE_STRIP_RD_PREFIX, &v); + if (r < 0) + log_debug_errno(r, "Failed to check if 'systemd.import_credentials=' kernel command line option is set, ignoring: %m"); + else if (r > 0) { + r = parse_boolean(v); + if (r < 0) + log_debug_errno(r, "Failed to parse 'systemd.import_credentials=' parameter, ignoring: %m"); + else if (r == 0) { + log_notice("systemd.import_credentials=no is set, skipping importing of credentials."); + return 0; + } + } + + r = import_credentials_boot(); + + q = import_credentials_trusted(); + if (r >= 0) + r = q; + } + + report_credentials(); + + /* Propagate vmm_notify_socket credential → $NOTIFY_SOCKET env var */ + (void) setenv_notify_socket(); + + return r; +} diff --git a/src/core/import-creds.h b/src/core/import-creds.h new file mode 100644 index 0000000..a87865c --- /dev/null +++ b/src/core/import-creds.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int import_credentials(void); diff --git a/src/core/job.c b/src/core/job.c new file mode 100644 index 0000000..e78c2a7 --- /dev/null +++ b/src/core/job.c @@ -0,0 +1,1712 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-id128.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "async.h" +#include "cgroup.h" +#include "dbus-job.h" +#include "dbus.h" +#include "escape.h" +#include "fileio.h" +#include "job.h" +#include "log.h" +#include "macro.h" +#include "parse-util.h" +#include "serialize.h" +#include "set.h" +#include "sort-util.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "unit.h" +#include "virt.h" + +Job* job_new_raw(Unit *unit) { + Job *j; + + /* used for deserialization */ + + assert(unit); + + j = new(Job, 1); + if (!j) + return NULL; + + *j = (Job) { + .manager = unit->manager, + .unit = unit, + .type = _JOB_TYPE_INVALID, + }; + + return j; +} + +static uint32_t manager_get_new_job_id(Manager *m) { + bool overflow = false; + + assert(m); + + for (;;) { + uint32_t id = m->current_job_id; + + if (_unlikely_(id == UINT32_MAX)) { + assert_se(!overflow); + m->current_job_id = 1; + overflow = true; + } else + m->current_job_id++; + + if (hashmap_contains(m->jobs, UINT32_TO_PTR(id))) + continue; + + return id; + } +} + +Job* job_new(Unit *unit, JobType type) { + Job *j; + + assert(type < _JOB_TYPE_MAX); + + j = job_new_raw(unit); + if (!j) + return NULL; + + j->id = manager_get_new_job_id(j->manager); + j->type = type; + + /* We don't link it here, that's what job_dependency() is for */ + + return j; +} + +void job_unlink(Job *j) { + assert(j); + assert(!j->installed); + assert(!j->transaction_prev); + assert(!j->transaction_next); + assert(!j->subject_list); + assert(!j->object_list); + + if (j->in_run_queue) { + prioq_remove(j->manager->run_queue, j, &j->run_queue_idx); + j->in_run_queue = false; + } + + if (j->in_dbus_queue) { + LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = false; + } + + if (j->in_gc_queue) { + LIST_REMOVE(gc_queue, j->manager->gc_job_queue, j); + j->in_gc_queue = false; + } + + j->timer_event_source = sd_event_source_disable_unref(j->timer_event_source); +} + +Job* job_free(Job *j) { + assert(j); + assert(!j->installed); + assert(!j->transaction_prev); + assert(!j->transaction_next); + assert(!j->subject_list); + assert(!j->object_list); + + job_unlink(j); + + sd_bus_track_unref(j->bus_track); + strv_free(j->deserialized_clients); + + activation_details_unref(j->activation_details); + + return mfree(j); +} + +static void job_set_state(Job *j, JobState state) { + assert(j); + assert(state >= 0); + assert(state < _JOB_STATE_MAX); + + if (j->state == state) + return; + + j->state = state; + + if (!j->installed) + return; + + if (j->state == JOB_RUNNING) + j->unit->manager->n_running_jobs++; + else { + assert(j->state == JOB_WAITING); + assert(j->unit->manager->n_running_jobs > 0); + + j->unit->manager->n_running_jobs--; + + if (j->unit->manager->n_running_jobs <= 0) + j->unit->manager->jobs_in_progress_event_source = sd_event_source_disable_unref(j->unit->manager->jobs_in_progress_event_source); + } +} + +void job_uninstall(Job *j) { + Job **pj; + + assert(j->installed); + + job_set_state(j, JOB_WAITING); + + pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job; + assert(*pj == j); + + /* Detach from next 'bigger' objects */ + + /* daemon-reload should be transparent to job observers */ + if (!MANAGER_IS_RELOADING(j->manager)) + bus_job_send_removed_signal(j); + + *pj = NULL; + + unit_add_to_gc_queue(j->unit); + + unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */ + + hashmap_remove_value(j->manager->jobs, UINT32_TO_PTR(j->id), j); + j->installed = false; +} + +static bool job_type_allows_late_merge(JobType t) { + /* Tells whether it is OK to merge a job of type 't' with an already + * running job. + * Reloads cannot be merged this way. Think of the sequence: + * 1. Reload of a daemon is in progress; the daemon has already loaded + * its config file, but hasn't completed the reload operation yet. + * 2. Edit foo's config file. + * 3. Trigger another reload to have the daemon use the new config. + * Should the second reload job be merged into the first one, the daemon + * would not know about the new config. + * JOB_RESTART jobs on the other hand can be merged, because they get + * patched into JOB_START after stopping the unit. So if we see a + * JOB_RESTART running, it means the unit hasn't stopped yet and at + * this time the merge is still allowed. */ + return t != JOB_RELOAD; +} + +static void job_merge_into_installed(Job *j, Job *other) { + assert(j->installed); + assert(j->unit == other->unit); + + if (j->type != JOB_NOP) { + assert_se(job_type_merge_and_collapse(&j->type, other->type, j->unit) == 0); + + /* Keep the oldest ActivationDetails, if any */ + if (!j->activation_details) + j->activation_details = TAKE_PTR(other->activation_details); + } else + assert(other->type == JOB_NOP); + + j->irreversible = j->irreversible || other->irreversible; + j->ignore_order = j->ignore_order || other->ignore_order; +} + +Job* job_install(Job *j, bool refuse_late_merge) { + Job **pj; + Job *uj; + + assert(j); + assert(!j->installed); + assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(j->state == JOB_WAITING); + + pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job; + uj = *pj; + + if (uj) { + if (job_type_is_conflicting(uj->type, j->type)) + job_finish_and_invalidate(uj, JOB_CANCELED, false, false); + else { + /* not conflicting, i.e. mergeable */ + + if (uj->state == JOB_WAITING || + (!refuse_late_merge && job_type_allows_late_merge(j->type) && job_type_is_superset(uj->type, j->type))) { + job_merge_into_installed(uj, j); + log_unit_debug(uj->unit, + "Merged %s/%s into installed job %s/%s as %"PRIu32, + j->unit->id, job_type_to_string(j->type), uj->unit->id, + job_type_to_string(uj->type), uj->id); + return uj; + } else { + /* already running and not safe to merge into */ + /* Patch uj to become a merged job and re-run it. */ + /* XXX It should be safer to queue j to run after uj finishes, but it is + * not currently possible to have more than one installed job per unit. */ + job_merge_into_installed(uj, j); + log_unit_debug(uj->unit, + "Merged into running job, re-running: %s/%s as %"PRIu32, + uj->unit->id, job_type_to_string(uj->type), uj->id); + + job_set_state(uj, JOB_WAITING); + return uj; + } + } + } + + /* Install the job */ + assert(!*pj); + *pj = j; + j->installed = true; + + j->manager->n_installed_jobs++; + log_unit_debug(j->unit, + "Installed new job %s/%s as %u", + j->unit->id, job_type_to_string(j->type), (unsigned) j->id); + + job_add_to_gc_queue(j); + + job_add_to_dbus_queue(j); /* announce this job to clients */ + unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */ + + return j; +} + +int job_install_deserialized(Job *j) { + Job **pj; + int r; + + assert(!j->installed); + + if (j->type < 0 || j->type >= _JOB_TYPE_MAX_IN_TRANSACTION) + return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EINVAL), + "Invalid job type %s in deserialization.", + strna(job_type_to_string(j->type))); + + pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job; + if (*pj) + return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EEXIST), + "Unit already has a job installed. Not installing deserialized job."); + + /* When the job does not have ID, or we failed to deserialize the job ID, then use a new ID. */ + if (j->id <= 0) + j->id = manager_get_new_job_id(j->manager); + + r = hashmap_ensure_put(&j->manager->jobs, NULL, UINT32_TO_PTR(j->id), j); + if (r == -EEXIST) + return log_unit_debug_errno(j->unit, r, "Job ID %" PRIu32 " already used, cannot deserialize job.", j->id); + if (r < 0) + return log_unit_debug_errno(j->unit, r, "Failed to insert job into jobs hash table: %m"); + + *pj = j; + j->installed = true; + + if (j->state == JOB_RUNNING) + j->unit->manager->n_running_jobs++; + + log_unit_debug(j->unit, + "Reinstalled deserialized job %s/%s as %u", + j->unit->id, job_type_to_string(j->type), (unsigned) j->id); + return 0; +} + +JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts) { + JobDependency *l; + + assert(object); + + /* Adds a new job link, which encodes that the 'subject' job + * needs the 'object' job in some way. If 'subject' is NULL + * this means the 'anchor' job (i.e. the one the user + * explicitly asked for) is the requester. */ + + l = new0(JobDependency, 1); + if (!l) + return NULL; + + l->subject = subject; + l->object = object; + l->matters = matters; + l->conflicts = conflicts; + + if (subject) + LIST_PREPEND(subject, subject->subject_list, l); + + LIST_PREPEND(object, object->object_list, l); + + return l; +} + +void job_dependency_free(JobDependency *l) { + assert(l); + + if (l->subject) + LIST_REMOVE(subject, l->subject->subject_list, l); + + LIST_REMOVE(object, l->object->object_list, l); + + free(l); +} + +void job_dump(Job *j, FILE *f, const char *prefix) { + assert(j); + assert(f); + + prefix = strempty(prefix); + + fprintf(f, + "%s-> Job %u:\n" + "%s\tAction: %s -> %s\n" + "%s\tState: %s\n" + "%s\tIrreversible: %s\n" + "%s\tMay GC: %s\n", + prefix, j->id, + prefix, j->unit->id, job_type_to_string(j->type), + prefix, job_state_to_string(j->state), + prefix, yes_no(j->irreversible), + prefix, yes_no(job_may_gc(j))); +} + +/* + * Merging is commutative, so imagine the matrix as symmetric. We store only + * its lower triangle to avoid duplication. We don't store the main diagonal, + * because A merged with A is simply A. + * + * If the resulting type is collapsed immediately afterwards (to get rid of + * the JOB_RELOAD_OR_START, which lies outside the lookup function's domain), + * the following properties hold: + * + * Merging is associative! A merged with B, and then merged with C is the same + * as A merged with the result of B merged with C. + * + * Mergeability is transitive! If A can be merged with B and B with C then + * A also with C. + * + * Also, if A merged with B cannot be merged with C, then either A or B cannot + * be merged with C either. + */ +static const JobType job_merging_table[] = { +/* What \ With * JOB_START JOB_VERIFY_ACTIVE JOB_STOP JOB_RELOAD */ +/*********************************************************************************/ +/*JOB_START */ +/*JOB_VERIFY_ACTIVE */ JOB_START, +/*JOB_STOP */ -1, -1, +/*JOB_RELOAD */ JOB_RELOAD_OR_START, JOB_RELOAD, -1, +/*JOB_RESTART */ JOB_RESTART, JOB_RESTART, -1, JOB_RESTART, +}; + +JobType job_type_lookup_merge(JobType a, JobType b) { + assert_cc(ELEMENTSOF(job_merging_table) == _JOB_TYPE_MAX_MERGING * (_JOB_TYPE_MAX_MERGING - 1) / 2); + assert(a >= 0 && a < _JOB_TYPE_MAX_MERGING); + assert(b >= 0 && b < _JOB_TYPE_MAX_MERGING); + + if (a == b) + return a; + + if (a < b) { + JobType tmp = a; + a = b; + b = tmp; + } + + return job_merging_table[(a - 1) * a / 2 + b]; +} + +bool job_type_is_redundant(JobType a, UnitActiveState b) { + switch (a) { + + case JOB_START: + return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING); + + case JOB_STOP: + return IN_SET(b, UNIT_INACTIVE, UNIT_FAILED); + + case JOB_VERIFY_ACTIVE: + return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING); + + case JOB_RELOAD: + return + b == UNIT_RELOADING; + + case JOB_RESTART: + /* Restart jobs must always be kept. + * + * For ACTIVE/RELOADING units, this is obvious. + * + * For ACTIVATING units, it's more subtle: + * + * Generally, if a service Requires= another unit, restarts of + * the unit must be propagated to the service. If the service is + * ACTIVATING, it must still be restarted since it might have + * stale information regarding the other unit. + * + * For example, consider a service that Requires= a socket: if + * the socket is restarted, but the service is still ACTIVATING, + * it's necessary to restart the service so that it gets the new + * socket. */ + return false; + + case JOB_NOP: + return true; + + default: + assert_not_reached(); + } +} + +JobType job_type_collapse(JobType t, Unit *u) { + UnitActiveState s; + + switch (t) { + + case JOB_TRY_RESTART: + /* Be sure to keep the restart job even if the unit is + * ACTIVATING. + * + * See the job_type_is_redundant(JOB_RESTART) for more info */ + s = unit_active_state(u); + if (!UNIT_IS_ACTIVE_OR_ACTIVATING(s)) + return JOB_NOP; + + return JOB_RESTART; + + case JOB_TRY_RELOAD: + s = unit_active_state(u); + if (!UNIT_IS_ACTIVE_OR_RELOADING(s)) + return JOB_NOP; + + return JOB_RELOAD; + + case JOB_RELOAD_OR_START: + s = unit_active_state(u); + if (!UNIT_IS_ACTIVE_OR_RELOADING(s)) + return JOB_START; + + return JOB_RELOAD; + + default: + return t; + } +} + +int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u) { + JobType t; + + t = job_type_lookup_merge(*a, b); + if (t < 0) + return -EEXIST; + + *a = job_type_collapse(t, u); + return 0; +} + +static bool job_is_runnable(Job *j) { + Unit *other; + + assert(j); + assert(j->installed); + + /* Checks whether there is any job running for the units this + * job needs to be running after (in the case of a 'positive' + * job type) or before (in the case of a 'negative' job + * type. */ + + /* Note that unit types have a say in what is runnable, + * too. For example, if they return -EAGAIN from + * unit_start() they can indicate they are not + * runnable yet. */ + + /* First check if there is an override */ + if (j->ignore_order) + return true; + + if (j->type == JOB_NOP) + return true; + + UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER) + if (other->job && job_compare(j, other->job, UNIT_ATOM_AFTER) > 0) { + log_unit_debug(j->unit, + "starting held back, waiting for: %s", + other->id); + return false; + } + + UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE) + if (other->job && job_compare(j, other->job, UNIT_ATOM_BEFORE) > 0) { + log_unit_debug(j->unit, + "stopping held back, waiting for: %s", + other->id); + return false; + } + + return true; +} + +static void job_change_type(Job *j, JobType newtype) { + assert(j); + + log_unit_debug(j->unit, + "Converting job %s/%s -> %s/%s", + j->unit->id, job_type_to_string(j->type), + j->unit->id, job_type_to_string(newtype)); + + j->type = newtype; +} + +static const char* job_start_message_format(Unit *u, JobType t) { + assert(u); + assert(IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD)); + + if (t == JOB_RELOAD) + return "Reloading %s..."; + else if (t == JOB_START) + return UNIT_VTABLE(u)->status_message_formats.starting_stopping[0] ?: "Starting %s..."; + else + return UNIT_VTABLE(u)->status_message_formats.starting_stopping[1] ?: "Stopping %s..."; +} + +static void job_emit_start_message(Unit *u, uint32_t job_id, JobType t) { + _cleanup_free_ char *free_ident = NULL; + const char *ident, *format; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + assert(u->id); /* We better don't try to run a unit that doesn't even have an id. */ + + if (!IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD)) + return; + + if (!unit_log_level_test(u, LOG_INFO)) + return; + + format = job_start_message_format(u, t); + ident = unit_status_string(u, &free_ident); + + bool do_console = t != JOB_RELOAD; + bool console_only = do_console && log_on_console(); /* Reload status messages have traditionally + * not been printed to the console. */ + + /* Print to the log first. */ + if (!console_only) { /* Skip this if it would only go on the console anyway */ + + const char *mid = + t == JOB_START ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTING_STR : + t == JOB_STOP ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPING_STR : + "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADING_STR; + const char *msg_fmt = strjoina("MESSAGE=", format); + + /* Note that we deliberately use LOG_MESSAGE() instead of LOG_UNIT_MESSAGE() here, since this + * is supposed to mimic closely what is written to screen using the status output, which is + * supposed to be high level friendly output. */ + + DISABLE_WARNING_FORMAT_NONLITERAL; + log_unit_struct(u, LOG_INFO, + msg_fmt, ident, + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + LOG_UNIT_INVOCATION_ID(u), + mid); + REENABLE_WARNING; + } + + /* Log to the console second. */ + if (do_console) { + DISABLE_WARNING_FORMAT_NONLITERAL; + unit_status_printf(u, STATUS_TYPE_NORMAL, "", format, ident); + REENABLE_WARNING; + } +} + +static const char* job_done_message_format(Unit *u, JobType t, JobResult result) { + static const char* const generic_finished_start_job[_JOB_RESULT_MAX] = { + [JOB_DONE] = "Started %s.", + [JOB_TIMEOUT] = "Timed out starting %s.", + [JOB_FAILED] = "Failed to start %s.", + [JOB_DEPENDENCY] = "Dependency failed for %s.", + [JOB_ASSERT] = "Assertion failed for %s.", + [JOB_UNSUPPORTED] = "Starting of %s unsupported.", + [JOB_COLLECTED] = "Unnecessary job was removed for %s.", + [JOB_ONCE] = "Unit %s has been started before and cannot be started again.", + }; + static const char* const generic_finished_stop_job[_JOB_RESULT_MAX] = { + [JOB_DONE] = "Stopped %s.", + [JOB_FAILED] = "Stopped %s with error.", + [JOB_TIMEOUT] = "Timed out stopping %s.", + }; + static const char* const generic_finished_reload_job[_JOB_RESULT_MAX] = { + [JOB_DONE] = "Reloaded %s.", + [JOB_FAILED] = "Reload failed for %s.", + [JOB_TIMEOUT] = "Timed out reloading %s.", + }; + /* When verify-active detects the unit is inactive, report it. + * Most likely a DEPEND warning from a requisiting unit will + * occur next and it's nice to see what was requisited. */ + static const char* const generic_finished_verify_active_job[_JOB_RESULT_MAX] = { + [JOB_SKIPPED] = "%s is inactive.", + }; + const char *format; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + /* Show condition check message if the job did not actually do anything due to unmet condition. */ + if (t == JOB_START && result == JOB_DONE && !u->condition_result) + return "Condition check resulted in %s being skipped."; + + if (IN_SET(t, JOB_START, JOB_STOP, JOB_RESTART)) { + const UnitStatusMessageFormats *formats = &UNIT_VTABLE(u)->status_message_formats; + if (formats->finished_job) { + format = formats->finished_job(u, t, result); + if (format) + return format; + } + + format = (t == JOB_START ? formats->finished_start_job : formats->finished_stop_job)[result]; + if (format) + return format; + } + + /* Return generic strings */ + switch (t) { + case JOB_START: + return generic_finished_start_job[result]; + case JOB_STOP: + case JOB_RESTART: + return generic_finished_stop_job[result]; + case JOB_RELOAD: + return generic_finished_reload_job[result]; + case JOB_VERIFY_ACTIVE: + return generic_finished_verify_active_job[result]; + default: + return NULL; + } +} + +static const struct { + int log_level; + const char *color, *word; +} job_done_messages[_JOB_RESULT_MAX] = { + [JOB_DONE] = { LOG_INFO, ANSI_OK_COLOR, " OK " }, + [JOB_CANCELED] = { LOG_INFO, }, + [JOB_TIMEOUT] = { LOG_ERR, ANSI_HIGHLIGHT_RED, " TIME " }, + [JOB_FAILED] = { LOG_ERR, ANSI_HIGHLIGHT_RED, "FAILED" }, + [JOB_DEPENDENCY] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "DEPEND" }, + [JOB_SKIPPED] = { LOG_NOTICE, ANSI_HIGHLIGHT, " INFO " }, + [JOB_INVALID] = { LOG_INFO, }, + [JOB_ASSERT] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "ASSERT" }, + [JOB_UNSUPPORTED] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "UNSUPP" }, + [JOB_COLLECTED] = { LOG_INFO, }, + [JOB_ONCE] = { LOG_ERR, ANSI_HIGHLIGHT_RED, " ONCE " }, +}; + +static const char* job_done_mid(JobType type, JobResult result) { + switch (type) { + case JOB_START: + if (result == JOB_DONE) + return "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTED_STR; + else + return "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILED_STR; + + case JOB_RELOAD: + return "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADED_STR; + + case JOB_STOP: + case JOB_RESTART: + return "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPED_STR; + + default: + return NULL; + } +} + +static void job_emit_done_message(Unit *u, uint32_t job_id, JobType t, JobResult result) { + _cleanup_free_ char *free_ident = NULL; + const char *ident, *format; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + if (!unit_log_level_test(u, job_done_messages[result].log_level)) + return; + + format = job_done_message_format(u, t, result); + if (!format) + return; + + ident = unit_status_string(u, &free_ident); + + const char *status = job_done_messages[result].word; + bool do_console = t != JOB_RELOAD && status; + bool console_only = do_console && log_on_console(); + + if (t == JOB_START && result == JOB_DONE && !u->condition_result) { + /* No message on the console if the job did not actually do anything due to unmet condition. */ + if (console_only) + return; + else + do_console = false; + } + + if (!console_only) { /* Skip printing if output goes to the console, and job_print_status_message() + * will actually print something to the console. */ + Condition *c; + const char *mid = job_done_mid(t, result); /* mid may be NULL. log_unit_struct() will ignore it. */ + + c = t == JOB_START && result == JOB_DONE ? unit_find_failed_condition(u) : NULL; + if (c) { + /* Special case units that were skipped because of a unmet condition check so that + * we can add more information to the message. */ + if (c->trigger) + log_unit_struct( + u, + job_done_messages[result].log_level, + LOG_MESSAGE("%s was skipped because no trigger condition checks were met.", + ident), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_INVOCATION_ID(u), + mid); + else + log_unit_struct( + u, + job_done_messages[result].log_level, + LOG_MESSAGE("%s was skipped because of an unmet condition check (%s=%s%s).", + ident, + condition_type_to_string(c->type), + c->negate ? "!" : "", + c->parameter), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_INVOCATION_ID(u), + mid); + } else { + const char *msg_fmt = strjoina("MESSAGE=", format); + + DISABLE_WARNING_FORMAT_NONLITERAL; + log_unit_struct(u, job_done_messages[result].log_level, + msg_fmt, ident, + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_INVOCATION_ID(u), + mid); + REENABLE_WARNING; + } + } + + if (do_console) { + if (log_get_show_color()) + status = strjoina(job_done_messages[result].color, + status, + ANSI_NORMAL); + + DISABLE_WARNING_FORMAT_NONLITERAL; + unit_status_printf(u, + result == JOB_DONE ? STATUS_TYPE_NORMAL : STATUS_TYPE_NOTICE, + status, format, ident); + REENABLE_WARNING; + + if (t == JOB_START && result == JOB_FAILED) { + _cleanup_free_ char *quoted = NULL; + + quoted = shell_maybe_quote(u->id, 0); + if (quoted) + manager_status_printf(u->manager, STATUS_TYPE_NORMAL, NULL, + "See 'systemctl status %s' for details.", quoted); + } + } +} + +static int job_perform_on_unit(Job **j) { + ActivationDetails *a; + uint32_t id; + Manager *m; + JobType t; + Unit *u; + bool wait_only; + int r; + + /* While we execute this operation the job might go away (for example: because it finishes immediately + * or is replaced by a new, conflicting job). To make sure we don't access a freed job later on we + * store the id here, so that we can verify the job is still valid. */ + + assert(j); + assert(*j); + + m = (*j)->manager; + u = (*j)->unit; + t = (*j)->type; + id = (*j)->id; + a = (*j)->activation_details; + + switch (t) { + case JOB_START: + r = unit_start(u, a); + wait_only = r == -EBADR; /* If the unit type does not support starting, then simply wait. */ + break; + + case JOB_RESTART: + t = JOB_STOP; + _fallthrough_; + case JOB_STOP: + r = unit_stop(u); + wait_only = r == -EBADR; /* If the unit type does not support stopping, then simply wait. */ + break; + + case JOB_RELOAD: + r = unit_reload(u); + wait_only = false; /* A clear error is generated if reload is not supported. */ + break; + + default: + assert_not_reached(); + } + + /* Log if the job still exists and the start/stop/reload function actually did something or we're + * only waiting for unit status change (common for device units). The latter ensures that job start + * messages for device units are correctly shown. Note that if the job disappears too quickly, e.g. + * for units for which there's no 'activating' phase (i.e. because we transition directly from + * 'inactive' to 'active'), we'll possibly skip the "Starting..." message. */ + *j = manager_get_job(m, id); + if (*j && (r > 0 || wait_only)) + job_emit_start_message(u, id, t); + + return wait_only ? 0 : r; +} + +int job_run_and_invalidate(Job *j) { + int r; + + assert(j); + assert(j->installed); + assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(j->in_run_queue); + + prioq_remove(j->manager->run_queue, j, &j->run_queue_idx); + j->in_run_queue = false; + + if (j->state != JOB_WAITING) + return 0; + + if (!job_is_runnable(j)) + return -EAGAIN; + + job_start_timer(j, true); + job_set_state(j, JOB_RUNNING); + job_add_to_dbus_queue(j); + + switch (j->type) { + + case JOB_VERIFY_ACTIVE: { + UnitActiveState t; + + t = unit_active_state(j->unit); + if (UNIT_IS_ACTIVE_OR_RELOADING(t)) + r = -EALREADY; + else if (t == UNIT_ACTIVATING) + r = -EAGAIN; + else + r = -EBADR; + break; + } + + case JOB_START: + case JOB_STOP: + case JOB_RESTART: + case JOB_RELOAD: + r = job_perform_on_unit(&j); + break; + + case JOB_NOP: + r = -EALREADY; + break; + + default: + assert_not_reached(); + } + + if (j) { + if (r == -EAGAIN) + job_set_state(j, JOB_WAITING); /* Hmm, not ready after all, let's return to JOB_WAITING state */ + else if (r == -EALREADY) /* already being executed */ + r = job_finish_and_invalidate(j, JOB_DONE, true, true); + else if (r == -ECOMM) + r = job_finish_and_invalidate(j, JOB_DONE, true, false); + else if (r == -EBADR) + r = job_finish_and_invalidate(j, JOB_SKIPPED, true, false); + else if (r == -ENOEXEC) + r = job_finish_and_invalidate(j, JOB_INVALID, true, false); + else if (r == -EPROTO) + r = job_finish_and_invalidate(j, JOB_ASSERT, true, false); + else if (r == -EOPNOTSUPP) + r = job_finish_and_invalidate(j, JOB_UNSUPPORTED, true, false); + else if (r == -ENOLINK) + r = job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false); + else if (r == -ESTALE) + r = job_finish_and_invalidate(j, JOB_ONCE, true, false); + else if (r < 0) + r = job_finish_and_invalidate(j, JOB_FAILED, true, false); + } + + return r; +} + +static void job_fail_dependencies(Unit *u, UnitDependencyAtom match_atom) { + Unit *other; + + assert(u); + + UNIT_FOREACH_DEPENDENCY(other, u, match_atom) { + Job *j = other->job; + + if (!j) + continue; + if (!IN_SET(j->type, JOB_START, JOB_VERIFY_ACTIVE)) + continue; + + job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false); + } +} + +int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) { + Unit *u, *other; + JobType t; + + assert(j); + assert(j->installed); + assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION); + + u = j->unit; + t = j->type; + + j->result = result; + + log_unit_debug(u, "Job %" PRIu32 " %s/%s finished, result=%s", + j->id, u->id, job_type_to_string(t), job_result_to_string(result)); + + /* If this job did nothing to the respective unit we don't log the status message */ + if (!already) + job_emit_done_message(u, j->id, t, result); + + /* Patch restart jobs so that they become normal start jobs */ + if (result == JOB_DONE && t == JOB_RESTART) { + + job_change_type(j, JOB_START); + job_set_state(j, JOB_WAITING); + + job_add_to_dbus_queue(j); + job_add_to_run_queue(j); + job_add_to_gc_queue(j); + + goto finish; + } + + if (IN_SET(result, JOB_FAILED, JOB_INVALID)) + j->manager->n_failed_jobs++; + + job_uninstall(j); + job_free(j); + + /* Fail depending jobs on failure */ + if (result != JOB_DONE && recursive) { + if (IN_SET(t, JOB_START, JOB_VERIFY_ACTIVE)) + job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_START_FAILURE); + else if (t == JOB_STOP) + job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_STOP_FAILURE); + } + + /* A special check to make sure we take down anything RequisiteOf= if we aren't active. This is when + * the verify-active job merges with a satisfying job type, and then loses its invalidation effect, + * as the result there is JOB_DONE for the start job we merged into, while we should be failing the + * depending job if the said unit isn't in fact active. Oneshots are an example of this, where going + * directly from activating to inactive is success. + * + * This happens when you use ConditionXYZ= in a unit too, since in that case the job completes with + * the JOB_DONE result, but the unit never really becomes active. Note that such a case still + * involves merging: + * + * A start job waits for something else, and a verify-active comes in and merges in the installed + * job. Then, later, when it becomes runnable, it finishes with JOB_DONE result as execution on + * conditions not being met is skipped, breaking our dependency semantics. + * + * Also, depending on if start job waits or not, the merging may or may not happen (the verify-active + * job may trigger after it finishes), so you get undeterministic results without this check. + */ + if (result == JOB_DONE && recursive && + IN_SET(t, JOB_START, JOB_RELOAD) && + !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE); + + /* Trigger OnFailure= dependencies that are not generated by the unit itself. We don't treat + * JOB_CANCELED as failure in this context. And JOB_FAILURE is already handled by the unit itself. */ + if (IN_SET(result, JOB_TIMEOUT, JOB_DEPENDENCY)) { + log_unit_struct(u, LOG_NOTICE, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_MESSAGE(u, "Job %s/%s failed with result '%s'.", + u->id, + job_type_to_string(t), + job_result_to_string(result))); + + unit_start_on_failure(u, "OnFailure=", UNIT_ATOM_ON_FAILURE, u->on_failure_job_mode); + } + + unit_trigger_notify(u); + +finish: + /* Try to start the next jobs that can be started */ + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_AFTER) + if (other->job) { + job_add_to_run_queue(other->job); + job_add_to_gc_queue(other->job); + } + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_BEFORE) + if (other->job) { + job_add_to_run_queue(other->job); + job_add_to_gc_queue(other->job); + } + + /* Ensure that when an upheld/unneeded/bound unit activation job fails we requeue it, if it still + * necessary. If there are no state changes in the triggerer, it would not be retried otherwise. */ + unit_submit_to_start_when_upheld_queue(u); + unit_submit_to_stop_when_bound_queue(u); + unit_submit_to_stop_when_unneeded_queue(u); + + manager_check_finished(u->manager); + + return 0; +} + +static int job_dispatch_timer(sd_event_source *s, uint64_t monotonic, void *userdata) { + Job *j = ASSERT_PTR(userdata); + Unit *u; + + assert(s == j->timer_event_source); + + log_unit_warning(j->unit, "Job %s/%s timed out.", j->unit->id, job_type_to_string(j->type)); + + u = j->unit; + job_finish_and_invalidate(j, JOB_TIMEOUT, true, false); + + emergency_action(u->manager, u->job_timeout_action, + EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN, + u->job_timeout_reboot_arg, -1, "job timed out"); + + return 0; +} + +int job_start_timer(Job *j, bool job_running) { + int r; + usec_t timeout_time, old_timeout_time; + + if (job_running) { + j->begin_running_usec = now(CLOCK_MONOTONIC); + + if (j->unit->job_running_timeout == USEC_INFINITY) + return 0; + + timeout_time = usec_add(j->begin_running_usec, j->unit->job_running_timeout); + + if (j->timer_event_source) { + /* Update only if JobRunningTimeoutSec= results in earlier timeout */ + r = sd_event_source_get_time(j->timer_event_source, &old_timeout_time); + if (r < 0) + return r; + + if (old_timeout_time <= timeout_time) + return 0; + + return sd_event_source_set_time(j->timer_event_source, timeout_time); + } + } else { + if (j->timer_event_source) + return 0; + + j->begin_usec = now(CLOCK_MONOTONIC); + + if (j->unit->job_timeout == USEC_INFINITY) + return 0; + + timeout_time = usec_add(j->begin_usec, j->unit->job_timeout); + } + + r = sd_event_add_time( + j->manager->event, + &j->timer_event_source, + CLOCK_MONOTONIC, + timeout_time, 0, + job_dispatch_timer, j); + if (r < 0) + return r; + + (void) sd_event_source_set_description(j->timer_event_source, "job-start"); + + return 0; +} + +void job_add_to_run_queue(Job *j) { + int r; + + assert(j); + assert(j->installed); + + if (j->in_run_queue) + return; + + r = prioq_put(j->manager->run_queue, j, &j->run_queue_idx); + if (r < 0) + log_warning_errno(r, "Failed put job in run queue, ignoring: %m"); + else + j->in_run_queue = true; + + manager_trigger_run_queue(j->manager); +} + +void job_add_to_dbus_queue(Job *j) { + assert(j); + assert(j->installed); + + if (j->in_dbus_queue) + return; + + /* We don't check if anybody is subscribed here, since this + * job might just have been created and not yet assigned to a + * connection/client. */ + + LIST_PREPEND(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = true; +} + +char *job_dbus_path(Job *j) { + char *p; + + assert(j); + + if (asprintf(&p, "/org/freedesktop/systemd1/job/%"PRIu32, j->id) < 0) + return NULL; + + return p; +} + +int job_serialize(Job *j, FILE *f) { + assert(j); + assert(f); + + (void) serialize_item_format(f, "job-id", "%u", j->id); + (void) serialize_item(f, "job-type", job_type_to_string(j->type)); + (void) serialize_item(f, "job-state", job_state_to_string(j->state)); + (void) serialize_bool(f, "job-irreversible", j->irreversible); + (void) serialize_bool(f, "job-sent-dbus-new-signal", j->sent_dbus_new_signal); + (void) serialize_bool(f, "job-ignore-order", j->ignore_order); + + if (j->begin_usec > 0) + (void) serialize_usec(f, "job-begin", j->begin_usec); + if (j->begin_running_usec > 0) + (void) serialize_usec(f, "job-begin-running", j->begin_running_usec); + + bus_track_serialize(j->bus_track, f, "subscribed"); + + activation_details_serialize(j->activation_details, f); + + /* End marker */ + fputc('\n', f); + return 0; +} + +int job_deserialize(Job *j, FILE *f) { + int r; + + assert(j); + assert(f); + + for (;;) { + _cleanup_free_ char *l = NULL; + size_t k; + char *v; + + r = deserialize_read_line(f, &l); + if (r < 0) + return r; + if (r == 0) /* eof or end marker */ + break; + + k = strcspn(l, "="); + + if (l[k] == '=') { + l[k] = 0; + v = l+k+1; + } else + v = l+k; + + if (streq(l, "job-id")) { + + if (safe_atou32(v, &j->id) < 0) + log_debug("Failed to parse job id value: %s", v); + + } else if (streq(l, "job-type")) { + JobType t; + + t = job_type_from_string(v); + if (t < 0) + log_debug("Failed to parse job type: %s", v); + else if (t >= _JOB_TYPE_MAX_IN_TRANSACTION) + log_debug("Cannot deserialize job of type: %s", v); + else + j->type = t; + + } else if (streq(l, "job-state")) { + JobState s; + + s = job_state_from_string(v); + if (s < 0) + log_debug("Failed to parse job state: %s", v); + else + job_set_state(j, s); + + } else if (streq(l, "job-irreversible")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_debug("Failed to parse job irreversible flag: %s", v); + else + j->irreversible = j->irreversible || b; + + } else if (streq(l, "job-sent-dbus-new-signal")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_debug("Failed to parse job sent_dbus_new_signal flag: %s", v); + else + j->sent_dbus_new_signal = j->sent_dbus_new_signal || b; + + } else if (streq(l, "job-ignore-order")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_debug("Failed to parse job ignore_order flag: %s", v); + else + j->ignore_order = j->ignore_order || b; + + } else if (streq(l, "job-begin")) + (void) deserialize_usec(v, &j->begin_usec); + + else if (streq(l, "job-begin-running")) + (void) deserialize_usec(v, &j->begin_running_usec); + + else if (streq(l, "subscribed")) { + if (strv_extend(&j->deserialized_clients, v) < 0) + return log_oom(); + + } else if (startswith(l, "activation-details")) { + if (activation_details_deserialize(l, v, &j->activation_details) < 0) + log_debug("Failed to parse job ActivationDetails element: %s", v); + + } else + log_debug("Unknown job serialization key: %s", l); + } + + return 0; +} + +int job_coldplug(Job *j) { + int r; + usec_t timeout_time = USEC_INFINITY; + + assert(j); + + /* After deserialization is complete and the bus connection + * set up again, let's start watching our subscribers again */ + (void) bus_job_coldplug_bus_track(j); + + if (j->state == JOB_WAITING) + job_add_to_run_queue(j); + + /* Maybe due to new dependencies we don't actually need this job anymore? */ + job_add_to_gc_queue(j); + + /* Create timer only when job began or began running and the respective timeout is finite. + * Follow logic of job_start_timer() if both timeouts are finite */ + if (j->begin_usec == 0) + return 0; + + if (j->unit->job_timeout != USEC_INFINITY) + timeout_time = usec_add(j->begin_usec, j->unit->job_timeout); + + if (timestamp_is_set(j->begin_running_usec)) + timeout_time = MIN(timeout_time, usec_add(j->begin_running_usec, j->unit->job_running_timeout)); + + if (timeout_time == USEC_INFINITY) + return 0; + + j->timer_event_source = sd_event_source_disable_unref(j->timer_event_source); + + r = sd_event_add_time( + j->manager->event, + &j->timer_event_source, + CLOCK_MONOTONIC, + timeout_time, 0, + job_dispatch_timer, j); + if (r < 0) + log_debug_errno(r, "Failed to restart timeout for job: %m"); + + (void) sd_event_source_set_description(j->timer_event_source, "job-timeout"); + + return r; +} + +void job_shutdown_magic(Job *j) { + assert(j); + + /* The shutdown target gets some special treatment here: we + * tell the kernel to begin with flushing its disk caches, to + * optimize shutdown time a bit. Ideally we wouldn't hardcode + * this magic into PID 1. However all other processes aren't + * options either since they'd exit much sooner than PID 1 and + * asynchronous sync() would cause their exit to be + * delayed. */ + + if (j->type != JOB_START) + return; + + if (!MANAGER_IS_SYSTEM(j->unit->manager)) + return; + + if (!unit_has_name(j->unit, SPECIAL_SHUTDOWN_TARGET)) + return; + + /* In case messages on console has been disabled on boot */ + j->unit->manager->no_console_output = false; + + manager_invalidate_startup_units(j->unit->manager); + + if (detect_container() > 0) + return; + + (void) asynchronous_sync(NULL); +} + +int job_get_timeout(Job *j, usec_t *ret) { + usec_t x = USEC_INFINITY, y = USEC_INFINITY; + Unit *u = ASSERT_PTR(ASSERT_PTR(j)->unit); + int r; + + assert(ret); + + if (j->timer_event_source) { + r = sd_event_source_get_time(j->timer_event_source, &x); + if (r < 0) + return r; + } + + if (UNIT_VTABLE(u)->get_timeout) { + r = UNIT_VTABLE(u)->get_timeout(u, &y); + if (r < 0) + return r; + } + + if (x == USEC_INFINITY && y == USEC_INFINITY) { + *ret = 0; + return 0; + } + + *ret = MIN(x, y); + return 1; +} + +bool job_may_gc(Job *j) { + Unit *other; + + assert(j); + + /* Checks whether this job should be GC'ed away. We only do this for jobs of units that have no effect on their + * own and just track external state. For now the only unit type that qualifies for this are .device units. + * Returns true if the job can be collected. */ + + if (!UNIT_VTABLE(j->unit)->gc_jobs) + return false; + + /* Make sure to send out pending D-Bus events before we unload the unit */ + if (j->in_dbus_queue) + return false; + + if (sd_bus_track_count(j->bus_track) > 0) + return false; + + /* FIXME: So this is a bit ugly: for now we don't properly track references made via private bus connections + * (because it's nasty, as sd_bus_track doesn't apply to it). We simply remember that the job was once + * referenced by one, and reset this whenever we notice that no private bus connections are around. This means + * the GC is a bit too conservative when it comes to jobs created by private bus connections. */ + if (j->ref_by_private_bus) { + if (set_isempty(j->unit->manager->private_buses)) + j->ref_by_private_bus = false; + else + return false; + } + + if (j->type == JOB_NOP) + return false; + + /* The logic is inverse to job_is_runnable, we cannot GC as long as we block any job. */ + UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE) + if (other->job && job_compare(j, other->job, UNIT_ATOM_BEFORE) < 0) + return false; + + UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER) + if (other->job && job_compare(j, other->job, UNIT_ATOM_AFTER) < 0) + return false; + + return true; +} + +void job_add_to_gc_queue(Job *j) { + assert(j); + + if (j->in_gc_queue) + return; + + if (!job_may_gc(j)) + return; + + LIST_PREPEND(gc_queue, j->unit->manager->gc_job_queue, j); + j->in_gc_queue = true; +} + +static int job_compare_id(Job * const *a, Job * const *b) { + return CMP((*a)->id, (*b)->id); +} + +static size_t sort_job_list(Job **list, size_t n) { + Job *previous = NULL; + size_t a, b; + + /* Order by numeric IDs */ + typesafe_qsort(list, n, job_compare_id); + + /* Filter out duplicates */ + for (a = 0, b = 0; a < n; a++) { + + if (previous == list[a]) + continue; + + previous = list[b++] = list[a]; + } + + return b; +} + +int job_get_before(Job *j, Job*** ret) { + _cleanup_free_ Job** list = NULL; + Unit *other = NULL; + size_t n = 0; + + /* Returns a list of all pending jobs that need to finish before this job may be started. */ + + assert(j); + assert(ret); + + if (j->ignore_order) { + *ret = NULL; + return 0; + } + + UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER) { + if (!other->job) + continue; + if (job_compare(j, other->job, UNIT_ATOM_AFTER) <= 0) + continue; + + if (!GREEDY_REALLOC(list, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE) { + if (!other->job) + continue; + if (job_compare(j, other->job, UNIT_ATOM_BEFORE) <= 0) + continue; + + if (!GREEDY_REALLOC(list, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + n = sort_job_list(list, n); + + *ret = TAKE_PTR(list); + + return (int) n; +} + +int job_get_after(Job *j, Job*** ret) { + _cleanup_free_ Job** list = NULL; + Unit *other = NULL; + size_t n = 0; + + assert(j); + assert(ret); + + /* Returns a list of all pending jobs that are waiting for this job to finish. */ + + UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE) { + if (!other->job) + continue; + + if (other->job->ignore_order) + continue; + + if (job_compare(j, other->job, UNIT_ATOM_BEFORE) >= 0) + continue; + + if (!GREEDY_REALLOC(list, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER) { + if (!other->job) + continue; + + if (other->job->ignore_order) + continue; + + if (job_compare(j, other->job, UNIT_ATOM_AFTER) >= 0) + continue; + + if (!GREEDY_REALLOC(list, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + n = sort_job_list(list, n); + + *ret = TAKE_PTR(list); + + return (int) n; +} + +static const char* const job_state_table[_JOB_STATE_MAX] = { + [JOB_WAITING] = "waiting", + [JOB_RUNNING] = "running", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_state, JobState); + +static const char* const job_type_table[_JOB_TYPE_MAX] = { + [JOB_START] = "start", + [JOB_VERIFY_ACTIVE] = "verify-active", + [JOB_STOP] = "stop", + [JOB_RELOAD] = "reload", + [JOB_RELOAD_OR_START] = "reload-or-start", + [JOB_RESTART] = "restart", + [JOB_TRY_RESTART] = "try-restart", + [JOB_TRY_RELOAD] = "try-reload", + [JOB_NOP] = "nop", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_type, JobType); + +static const char* const job_mode_table[_JOB_MODE_MAX] = { + [JOB_FAIL] = "fail", + [JOB_REPLACE] = "replace", + [JOB_REPLACE_IRREVERSIBLY] = "replace-irreversibly", + [JOB_ISOLATE] = "isolate", + [JOB_FLUSH] = "flush", + [JOB_IGNORE_DEPENDENCIES] = "ignore-dependencies", + [JOB_IGNORE_REQUIREMENTS] = "ignore-requirements", + [JOB_TRIGGERING] = "triggering", + [JOB_RESTART_DEPENDENCIES] = "restart-dependencies", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_mode, JobMode); + +static const char* const job_result_table[_JOB_RESULT_MAX] = { + [JOB_DONE] = "done", + [JOB_CANCELED] = "canceled", + [JOB_TIMEOUT] = "timeout", + [JOB_FAILED] = "failed", + [JOB_DEPENDENCY] = "dependency", + [JOB_SKIPPED] = "skipped", + [JOB_INVALID] = "invalid", + [JOB_ASSERT] = "assert", + [JOB_UNSUPPORTED] = "unsupported", + [JOB_COLLECTED] = "collected", + [JOB_ONCE] = "once", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_result, JobResult); + +const char* job_type_to_access_method(JobType t) { + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + if (IN_SET(t, JOB_START, JOB_RESTART, JOB_TRY_RESTART)) + return "start"; + else if (t == JOB_STOP) + return "stop"; + else + return "reload"; +} + +/* + * assume_dep assumed dependency between units (a is before/after b) + * + * Returns + * 0 jobs are independent, + * >0 a should run after b, + * <0 a should run before b, + * + * The logic means that for a service a and a service b where b.After=a: + * + * start a + start b → 1st step start a, 2nd step start b + * start a + stop b → 1st step stop b, 2nd step start a + * stop a + start b → 1st step stop a, 2nd step start b + * stop a + stop b → 1st step stop b, 2nd step stop a + * + * This has the side effect that restarts are properly synchronized too. + */ +int job_compare(Job *a, Job *b, UnitDependencyAtom assume_dep) { + assert(a); + assert(b); + assert(a->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(b->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(IN_SET(assume_dep, UNIT_ATOM_AFTER, UNIT_ATOM_BEFORE)); + + /* Trivial cases first */ + if (a->type == JOB_NOP || b->type == JOB_NOP) + return 0; + + if (a->ignore_order || b->ignore_order) + return 0; + + if (assume_dep == UNIT_ATOM_AFTER) + return -job_compare(b, a, UNIT_ATOM_BEFORE); + + /* Let's make it simple, JOB_STOP goes always first (in case both ua and ub stop, then ub's stop goes + * first anyway). JOB_RESTART is JOB_STOP in disguise (before it is patched to JOB_START). */ + if (IN_SET(b->type, JOB_STOP, JOB_RESTART)) + return 1; + else + return -1; +} + +void job_set_activation_details(Job *j, ActivationDetails *info) { + /* Existing (older) ActivationDetails win, newer ones are discarded. */ + if (!j || j->activation_details || !info) + return; /* Nothing to do. */ + + j->activation_details = activation_details_ref(info); +} diff --git a/src/core/job.h b/src/core/job.h new file mode 100644 index 0000000..891d87a --- /dev/null +++ b/src/core/job.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-event.h" + +#include "list.h" +#include "unit-dependency-atom.h" +#include "unit-name.h" +#include "unit.h" + +typedef struct ActivationDetails ActivationDetails; +typedef struct Job Job; +typedef struct JobDependency JobDependency; +typedef enum JobType JobType; +typedef enum JobState JobState; +typedef enum JobMode JobMode; +typedef enum JobResult JobResult; + +/* Be careful when changing the job types! Adjust job_merging_table[] accordingly! */ +enum JobType { + JOB_START, /* if a unit does not support being started, we'll just wait until it becomes active */ + JOB_VERIFY_ACTIVE, + + JOB_STOP, + + JOB_RELOAD, /* if running, reload */ + + /* Note that restarts are first treated like JOB_STOP, but + * then instead of finishing are patched to become + * JOB_START. */ + JOB_RESTART, /* If running, stop. Then start unconditionally. */ + + _JOB_TYPE_MAX_MERGING, + + /* JOB_NOP can enter into a transaction, but as it won't pull in + * any dependencies and it uses the special 'nop_job' slot in Unit, + * it won't have to merge with anything (except possibly into another + * JOB_NOP, previously installed). JOB_NOP is special-cased in + * job_type_is_*() functions so that the transaction can be + * activated. */ + JOB_NOP = _JOB_TYPE_MAX_MERGING, /* do nothing */ + + _JOB_TYPE_MAX_IN_TRANSACTION, + + /* JOB_TRY_RESTART can never appear in a transaction, because + * it always collapses into JOB_RESTART or JOB_NOP before entering. + * Thus we never need to merge it with anything. */ + JOB_TRY_RESTART = _JOB_TYPE_MAX_IN_TRANSACTION, /* if running, stop and then start */ + + /* Similar to JOB_TRY_RESTART but collapses to JOB_RELOAD or JOB_NOP */ + JOB_TRY_RELOAD, + + /* JOB_RELOAD_OR_START won't enter into a transaction and cannot result + * from transaction merging (there's no way for JOB_RELOAD and + * JOB_START to meet in one transaction). It can result from a merge + * during job installation, but then it will immediately collapse into + * one of the two simpler types. */ + JOB_RELOAD_OR_START, /* if running, reload, otherwise start */ + + _JOB_TYPE_MAX, + _JOB_TYPE_INVALID = -EINVAL, +}; + +enum JobState { + JOB_WAITING, + JOB_RUNNING, + _JOB_STATE_MAX, + _JOB_STATE_INVALID = -EINVAL, +}; + +enum JobMode { + JOB_FAIL, /* Fail if a conflicting job is already queued */ + JOB_REPLACE, /* Replace an existing conflicting job */ + JOB_REPLACE_IRREVERSIBLY,/* Like JOB_REPLACE + produce irreversible jobs */ + JOB_ISOLATE, /* Start a unit, and stop all others */ + JOB_FLUSH, /* Flush out all other queued jobs when queueing this one */ + JOB_IGNORE_DEPENDENCIES, /* Ignore both requirement and ordering dependencies */ + JOB_IGNORE_REQUIREMENTS, /* Ignore requirement dependencies */ + JOB_TRIGGERING, /* Adds TRIGGERED_BY dependencies to the same transaction */ + JOB_RESTART_DEPENDENCIES,/* A "start" job for the specified unit becomes "restart" for depending units */ + _JOB_MODE_MAX, + _JOB_MODE_INVALID = -EINVAL, +}; + +enum JobResult { + JOB_DONE, /* Job completed successfully (or skipped due to an unmet ConditionXYZ=) */ + JOB_CANCELED, /* Job canceled by a conflicting job installation or by explicit cancel request */ + JOB_TIMEOUT, /* Job timeout elapsed */ + JOB_FAILED, /* Job failed */ + JOB_DEPENDENCY, /* A required dependency job did not result in JOB_DONE */ + JOB_SKIPPED, /* Negative result of JOB_VERIFY_ACTIVE or skip due to ExecCondition= */ + JOB_INVALID, /* JOB_RELOAD of inactive unit */ + JOB_ASSERT, /* Couldn't start a unit, because an assert didn't hold */ + JOB_UNSUPPORTED, /* Couldn't start a unit, because the unit type is not supported on the system */ + JOB_COLLECTED, /* Job was garbage collected, since nothing needed it anymore */ + JOB_ONCE, /* Unit was started before, and hence can't be started again */ + _JOB_RESULT_MAX, + _JOB_RESULT_INVALID = -EINVAL, +}; + +struct JobDependency { + /* Encodes that the 'subject' job needs the 'object' job in + * some way. This structure is used only while building a transaction. */ + Job *subject; + Job *object; + + LIST_FIELDS(JobDependency, subject); + LIST_FIELDS(JobDependency, object); + + bool matters:1; + bool conflicts:1; +}; + +struct Job { + Manager *manager; + Unit *unit; + + LIST_FIELDS(Job, transaction); + LIST_FIELDS(Job, dbus_queue); + LIST_FIELDS(Job, gc_queue); + + LIST_HEAD(JobDependency, subject_list); + LIST_HEAD(JobDependency, object_list); + + /* Used for graph algs as a "I have been here" marker */ + Job* marker; + unsigned generation; + + uint32_t id; + + JobType type; + JobState state; + + sd_event_source *timer_event_source; + usec_t begin_usec; + usec_t begin_running_usec; + + /* + * This tracks where to send signals, and also which clients + * are allowed to call DBus methods on the job (other than + * root). + * + * There can be more than one client, because of job merging. + */ + sd_bus_track *bus_track; + char **deserialized_clients; + + JobResult result; + + unsigned run_queue_idx; + + /* If the job had a specific trigger that needs to be advertised (eg: a path unit), store it. */ + ActivationDetails *activation_details; + + bool installed:1; + bool in_run_queue:1; + bool matters_to_anchor:1; + bool in_dbus_queue:1; + bool sent_dbus_new_signal:1; + bool ignore_order:1; + bool irreversible:1; + bool in_gc_queue:1; + bool ref_by_private_bus:1; +}; + +Job* job_new(Unit *unit, JobType type); +Job* job_new_raw(Unit *unit); +void job_unlink(Job *job); +Job* job_free(Job *job); +Job* job_install(Job *j, bool refuse_late_merge); +int job_install_deserialized(Job *j); +void job_uninstall(Job *j); +void job_dump(Job *j, FILE *f, const char *prefix); +int job_serialize(Job *j, FILE *f); +int job_deserialize(Job *j, FILE *f); +int job_coldplug(Job *j); + +JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts); +void job_dependency_free(JobDependency *l); + +int job_merge(Job *j, Job *other); + +JobType job_type_lookup_merge(JobType a, JobType b) _pure_; + +_pure_ static inline bool job_type_is_mergeable(JobType a, JobType b) { + return job_type_lookup_merge(a, b) >= 0; +} + +_pure_ static inline bool job_type_is_conflicting(JobType a, JobType b) { + return a != JOB_NOP && b != JOB_NOP && !job_type_is_mergeable(a, b); +} + +_pure_ static inline bool job_type_is_superset(JobType a, JobType b) { + /* Checks whether operation a is a "superset" of b in its actions */ + if (b == JOB_NOP) + return true; + if (a == JOB_NOP) + return false; + return a == job_type_lookup_merge(a, b); +} + +bool job_type_is_redundant(JobType a, UnitActiveState b) _pure_; + +/* Collapses a state-dependent job type into a simpler type by observing + * the state of the unit which it is going to be applied to. */ +JobType job_type_collapse(JobType t, Unit *u); + +int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u); + +void job_add_to_run_queue(Job *j); +void job_add_to_dbus_queue(Job *j); + +int job_start_timer(Job *j, bool job_running); + +int job_run_and_invalidate(Job *j); +int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already); + +char *job_dbus_path(Job *j); + +void job_shutdown_magic(Job *j); + +int job_get_timeout(Job *j, usec_t *ret); + +bool job_may_gc(Job *j); +void job_add_to_gc_queue(Job *j); + +int job_get_before(Job *j, Job*** ret); +int job_get_after(Job *j, Job*** ret); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Job*, job_free); + +const char* job_type_to_string(JobType t) _const_; +JobType job_type_from_string(const char *s) _pure_; + +const char* job_state_to_string(JobState t) _const_; +JobState job_state_from_string(const char *s) _pure_; + +const char* job_mode_to_string(JobMode t) _const_; +JobMode job_mode_from_string(const char *s) _pure_; + +const char* job_result_to_string(JobResult t) _const_; +JobResult job_result_from_string(const char *s) _pure_; + +const char* job_type_to_access_method(JobType t); + +int job_compare(Job *a, Job *b, UnitDependencyAtom assume_dep); + +void job_set_activation_details(Job *j, ActivationDetails *info); diff --git a/src/core/kill.c b/src/core/kill.c new file mode 100644 index 0000000..c8b581d --- /dev/null +++ b/src/core/kill.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "kill.h" +#include "signal-util.h" +#include "string-table.h" + +void kill_context_init(KillContext *c) { + assert(c); + + c->kill_signal = SIGTERM; + /* restart_kill_signal is unset by default and we fall back to kill_signal */ + c->final_kill_signal = SIGKILL; + c->send_sigkill = true; + c->send_sighup = false; + c->watchdog_signal = SIGABRT; +} + +void kill_context_dump(KillContext *c, FILE *f, const char *prefix) { + assert(c); + + prefix = strempty(prefix); + + fprintf(f, + "%sKillMode: %s\n" + "%sKillSignal: SIG%s\n" + "%sRestartKillSignal: SIG%s\n" + "%sFinalKillSignal: SIG%s\n" + "%sSendSIGKILL: %s\n" + "%sSendSIGHUP: %s\n", + prefix, kill_mode_to_string(c->kill_mode), + prefix, signal_to_string(c->kill_signal), + prefix, signal_to_string(restart_kill_signal(c)), + prefix, signal_to_string(c->final_kill_signal), + prefix, yes_no(c->send_sigkill), + prefix, yes_no(c->send_sighup)); +} + +static const char* const kill_mode_table[_KILL_MODE_MAX] = { + [KILL_CONTROL_GROUP] = "control-group", + [KILL_PROCESS] = "process", + [KILL_MIXED] = "mixed", + [KILL_NONE] = "none", +}; + +DEFINE_STRING_TABLE_LOOKUP(kill_mode, KillMode); + +static const char* const kill_who_table[_KILL_WHO_MAX] = { + [KILL_MAIN] = "main", + [KILL_CONTROL] = "control", + [KILL_ALL] = "all", + [KILL_MAIN_FAIL] = "main-fail", + [KILL_CONTROL_FAIL] = "control-fail", + [KILL_ALL_FAIL] = "all-fail", +}; + +DEFINE_STRING_TABLE_LOOKUP(kill_who, KillWho); diff --git a/src/core/kill.h b/src/core/kill.h new file mode 100644 index 0000000..dbf884d --- /dev/null +++ b/src/core/kill.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct KillContext KillContext; + +#include +#include + +#include "macro.h" + +typedef enum KillMode { + /* The kill mode is a property of a unit. */ + KILL_CONTROL_GROUP = 0, + KILL_PROCESS, + KILL_MIXED, + KILL_NONE, + _KILL_MODE_MAX, + _KILL_MODE_INVALID = -EINVAL, +} KillMode; + +struct KillContext { + KillMode kill_mode; + int kill_signal; + int restart_kill_signal; + int final_kill_signal; + int watchdog_signal; + bool send_sigkill; + bool send_sighup; +}; + +typedef enum KillWho { + /* Kill who is a property of an operation */ + KILL_MAIN, + KILL_CONTROL, + KILL_ALL, + KILL_MAIN_FAIL, + KILL_CONTROL_FAIL, + KILL_ALL_FAIL, + _KILL_WHO_MAX, + _KILL_WHO_INVALID = -EINVAL, +} KillWho; + +void kill_context_init(KillContext *c); +void kill_context_dump(KillContext *c, FILE *f, const char *prefix); + +const char *kill_mode_to_string(KillMode k) _const_; +KillMode kill_mode_from_string(const char *s) _pure_; + +const char *kill_who_to_string(KillWho k) _const_; +KillWho kill_who_from_string(const char *s) _pure_; + +static inline int restart_kill_signal(const KillContext *c) { + if (c->restart_kill_signal != 0) + return c->restart_kill_signal; + return c->kill_signal; +} diff --git a/src/core/kmod-setup.c b/src/core/kmod-setup.c new file mode 100644 index 0000000..b8e3f7a --- /dev/null +++ b/src/core/kmod-setup.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "bus-util.h" +#include "capability-util.h" +#include "efi-api.h" +#include "fileio.h" +#include "kmod-setup.h" +#include "macro.h" +#include "recurse-dir.h" +#include "string-util.h" +#include "strv.h" +#include "virt.h" + +#if HAVE_KMOD +#include "module-util.h" + +static void systemd_kmod_log( + void *data, + int priority, + const char *file, int line, + const char *fn, + const char *format, + va_list args) { + + /* library logging is enabled at debug only */ + DISABLE_WARNING_FORMAT_NONLITERAL; + log_internalv(LOG_DEBUG, 0, file, line, fn, format, args); + REENABLE_WARNING; +} + +static int match_modalias_recurse_dir_cb( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + + _cleanup_free_ char *alias = NULL; + char **modaliases = ASSERT_PTR(userdata); + int r; + + if (event != RECURSE_DIR_ENTRY) + return RECURSE_DIR_CONTINUE; + + if (de->d_type != DT_REG) + return RECURSE_DIR_CONTINUE; + + if (!streq(de->d_name, "modalias")) + return RECURSE_DIR_CONTINUE; + + r = read_one_line_file(path, &alias); + if (r < 0) { + log_debug_errno(r, "Failed to read %s, ignoring: %m", path); + return RECURSE_DIR_LEAVE_DIRECTORY; + } + + if (startswith_strv(alias, modaliases)) + return 1; + + return RECURSE_DIR_LEAVE_DIRECTORY; +} + +static bool has_virtio_feature(const char *name, char **modaliases) { + int r; + + /* Directory traversal might be slow, hence let's do a cheap check first if it's even worth it */ + if (detect_vm() == VIRTUALIZATION_NONE) + return false; + + r = recurse_dir_at( + AT_FDCWD, + "/sys/devices/pci0000:00", + /* statx_mask= */ 0, + /* n_depth_max= */ 3, + RECURSE_DIR_ENSURE_TYPE, + match_modalias_recurse_dir_cb, + modaliases); + if (r < 0) + log_debug_errno(r, "Failed to determine whether host has %s device, ignoring: %m", name); + + return r > 0; +} + +static bool has_virtio_rng(void) { + return has_virtio_feature("virtio-rng", STRV_MAKE("pci:v00001AF4d00001005", "pci:v00001AF4d00001044")); +} + +static bool has_virtio_console(void) { + return has_virtio_feature("virtio-console", STRV_MAKE("virtio:d00000003v", "virtio:d0000000Bv")); +} + +static bool has_virtio_vsock(void) { + return has_virtio_feature("virtio-vsock", STRV_MAKE("virtio:d00000013v")); +} + +static bool has_virtiofs(void) { + return has_virtio_feature("virtiofs", STRV_MAKE("virtio:d0000001Av")); +} + +static bool has_virtio_pci(void) { + return has_virtio_feature("virtio-pci", STRV_MAKE("pci:v00001AF4d")); +} + +static bool in_qemu(void) { + return IN_SET(detect_vm(), VIRTUALIZATION_KVM, VIRTUALIZATION_QEMU); +} +#endif + +int kmod_setup(void) { +#if HAVE_KMOD + + static const struct { + const char *module; + const char *path; + bool warn_if_unavailable:1; + bool warn_if_module:1; + bool (*condition_fn)(void); + } kmod_table[] = { + /* This one we need to load explicitly, since auto-loading on use doesn't work + * before udev created the ghost device nodes, and we need it earlier than that. */ + { "autofs4", "/sys/class/misc/autofs", true, false, NULL }, + + /* This one we need to load explicitly, since auto-loading of IPv6 is not done when + * we try to configure ::1 on the loopback device. */ + { "ipv6", "/sys/module/ipv6", false, true, NULL }, + + /* This should never be a module */ + { "unix", "/proc/net/unix", true, true, NULL }, + +#if HAVE_LIBIPTC + /* netfilter is needed by networkd, nspawn among others, and cannot be autoloaded */ + { "ip_tables", "/proc/net/ip_tables_names", false, false, NULL }, +#endif + /* virtio_rng would be loaded by udev later, but real entropy might be needed very early */ + { "virtio_rng", NULL, false, false, has_virtio_rng }, + + /* we want early logging to hvc consoles if possible, and make sure systemd-getty-generator + * can rely on all consoles being probed already.*/ + { "virtio_console", NULL, false, false, has_virtio_console }, + + /* Make sure we can send sd-notify messages over vsock as early as possible. */ + { "vmw_vsock_virtio_transport", NULL, false, false, has_virtio_vsock }, + + /* We can't wait for specific virtiofs tags to show up as device nodes so we have to load the + * virtiofs and virtio_pci modules early to make sure the virtiofs tags are found when + * sysroot.mount is started. + * + * TODO: Remove these again once https://gitlab.com/virtio-fs/virtiofsd/-/issues/128 is + * resolved and the kernel fix is widely available. */ + { "virtiofs", "/sys/module/virtiofs", false, false, has_virtiofs }, + { "virtio_pci", "/sys/module/virtio_pci", false, false, has_virtio_pci }, + + /* qemu_fw_cfg would be loaded by udev later, but we want to import credentials from it super early */ + { "qemu_fw_cfg", "/sys/firmware/qemu_fw_cfg", false, false, in_qemu }, + + /* dmi-sysfs is needed to import credentials from it super early */ + { "dmi-sysfs", "/sys/firmware/dmi/entries", false, false, NULL }, + +#if HAVE_TPM2 + /* Make sure the tpm subsystem is available which ConditionSecurity=tpm2 depends on. */ + { "tpm", "/sys/class/tpmrm", false, false, efi_has_tpm2 }, +#endif + }; + _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL; + unsigned i; + + if (have_effective_cap(CAP_SYS_MODULE) <= 0) + return 0; + + for (i = 0; i < ELEMENTSOF(kmod_table); i++) { + if (kmod_table[i].path && access(kmod_table[i].path, F_OK) >= 0) + continue; + + if (kmod_table[i].condition_fn && !kmod_table[i].condition_fn()) + continue; + + if (kmod_table[i].warn_if_module) + log_debug("Your kernel apparently lacks built-in %s support. Might be " + "a good idea to compile it in. We'll now try to work around " + "this by loading the module...", kmod_table[i].module); + + if (!ctx) { + ctx = kmod_new(NULL, NULL); + if (!ctx) + return log_oom(); + + kmod_set_log_fn(ctx, systemd_kmod_log, NULL); + kmod_load_resources(ctx); + } + + (void) module_load_and_warn(ctx, kmod_table[i].module, kmod_table[i].warn_if_unavailable); + } + +#endif + return 0; +} diff --git a/src/core/kmod-setup.h b/src/core/kmod-setup.h new file mode 100644 index 0000000..1c842d3 --- /dev/null +++ b/src/core/kmod-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int kmod_setup(void); diff --git a/src/core/load-dropin.c b/src/core/load-dropin.c new file mode 100644 index 0000000..fd45744 --- /dev/null +++ b/src/core/load-dropin.c @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "conf-parser.h" +#include "fs-util.h" +#include "load-dropin.h" +#include "load-fragment.h" +#include "log.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +static int process_deps(Unit *u, UnitDependency dependency, const char *dir_suffix) { + _cleanup_strv_free_ char **paths = NULL; + int r; + + r = unit_file_find_dropin_paths(NULL, + u->manager->lookup_paths.search_path, + u->manager->unit_path_cache, + dir_suffix, NULL, + u->id, u->aliases, + &paths); + if (r < 0) + return r; + + STRV_FOREACH(p, paths) { + _cleanup_free_ char *target = NULL; + const char *entry; + + entry = basename(*p); + + if (null_or_empty_path(*p) > 0) { + /* an error usually means an invalid symlink, which is not a mask */ + log_unit_debug(u, "%s dependency on %s is masked by %s, ignoring.", + unit_dependency_to_string(dependency), entry, *p); + continue; + } + + r = is_symlink(*p); + if (r < 0) { + log_unit_warning_errno(u, r, "%s dropin %s unreadable, ignoring: %m", + unit_dependency_to_string(dependency), *p); + continue; + } + if (r == 0) { + log_unit_warning(u, "%s dependency dropin %s is not a symlink, ignoring.", + unit_dependency_to_string(dependency), *p); + continue; + } + + if (!unit_name_is_valid(entry, UNIT_NAME_ANY)) { + log_unit_warning(u, "%s dependency dropin %s is not a valid unit name, ignoring.", + unit_dependency_to_string(dependency), *p); + continue; + } + + r = readlink_malloc(*p, &target); + if (r < 0) { + log_unit_warning_errno(u, r, "readlink(\"%s\") failed, ignoring: %m", *p); + continue; + } + + /* We don't treat this as an error, especially because we didn't check this for a + * long time. Nevertheless, we warn, because such mismatch can be mighty confusing. */ + r = unit_symlink_name_compatible(entry, basename(target), u->instance); + if (r < 0) { + log_unit_warning_errno(u, r, "Can't check if names %s and %s are compatible, ignoring: %m", + entry, basename(target)); + continue; + } + if (r == 0) + log_unit_warning(u, "%s dependency dropin %s target %s has different name", + unit_dependency_to_string(dependency), *p, target); + + r = unit_add_dependency_by_name(u, dependency, entry, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_unit_warning_errno(u, r, "Cannot add %s dependency on %s, ignoring: %m", + unit_dependency_to_string(dependency), entry); + } + + return 0; +} + +int unit_load_dropin(Unit *u) { + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(u); + + /* Load dependencies from .wants, .requires and .upholds directories */ + r = process_deps(u, UNIT_WANTS, ".wants"); + if (r < 0) + return r; + + r = process_deps(u, UNIT_REQUIRES, ".requires"); + if (r < 0) + return r; + + r = process_deps(u, UNIT_UPHOLDS, ".upholds"); + if (r < 0) + return r; + + /* Load .conf dropins */ + r = unit_find_dropin_paths(u, &l); + if (r <= 0) + return 0; + + if (!u->dropin_paths) + u->dropin_paths = TAKE_PTR(l); + else { + r = strv_extend_strv(&u->dropin_paths, l, true); + if (r < 0) + return log_oom(); + } + + u->dropin_mtime = 0; + STRV_FOREACH(f, u->dropin_paths) { + struct stat st; + + r = config_parse(u->id, *f, NULL, + UNIT_VTABLE(u)->sections, + config_item_perf_lookup, load_fragment_gperf_lookup, + 0, u, &st); + if (r > 0) + u->dropin_mtime = MAX(u->dropin_mtime, timespec_load(&st.st_mtim)); + } + + return 0; +} diff --git a/src/core/load-dropin.h b/src/core/load-dropin.h new file mode 100644 index 0000000..f0b87d3 --- /dev/null +++ b/src/core/load-dropin.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "dropin.h" +#include "unit.h" + +/* Read service data supplementary drop-in directories */ + +static inline int unit_find_dropin_paths(Unit *u, char ***paths) { + assert(u); + + return unit_file_find_dropin_paths(NULL, + u->manager->lookup_paths.search_path, + u->manager->unit_path_cache, + ".d", ".conf", + u->id, u->aliases, + paths); +} + +int unit_load_dropin(Unit *u); diff --git a/src/core/load-fragment-gperf-nulstr.awk b/src/core/load-fragment-gperf-nulstr.awk new file mode 100644 index 0000000..a1b7d1c --- /dev/null +++ b/src/core/load-fragment-gperf-nulstr.awk @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +BEGIN{ + keywords=0 ; FS="," ; + print "extern const char load_fragment_gperf_nulstr[];" ; + print "const char load_fragment_gperf_nulstr[] =" +} +keyword==1 { + print "\"" $1 "\\0\"" +} +/%%/ { + keyword=1 +} +END { + print ";" +} diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in new file mode 100644 index 0000000..45f9ab0 --- /dev/null +++ b/src/core/load-fragment-gperf.gperf.in @@ -0,0 +1,595 @@ +{# SPDX-License-Identifier: LGPL-2.1-or-later #} + +{%- macro EXEC_CONTEXT_CONFIG_ITEMS(type) -%} +{# Define the context options only once #} +{{type}}.WorkingDirectory, config_parse_working_directory, 0, offsetof({{type}}, exec_context) +{{type}}.RootDirectory, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_directory) +{{type}}.RootImage, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_image) +{{type}}.RootImageOptions, config_parse_root_image_options, 0, offsetof({{type}}, exec_context) +{{type}}.RootImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.root_image_policy) +{{type}}.RootHash, config_parse_exec_root_hash, 0, offsetof({{type}}, exec_context) +{{type}}.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof({{type}}, exec_context) +{{type}}.RootVerity, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_verity) +{{type}}.RootEphemeral, config_parse_bool, 0, offsetof({{type}}, exec_context.root_ephemeral) +{{type}}.ExtensionDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.extension_directories) +{{type}}.ExtensionImages, config_parse_extension_images, 0, offsetof({{type}}, exec_context) +{{type}}.ExtensionImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.extension_image_policy) +{{type}}.MountImages, config_parse_mount_images, 0, offsetof({{type}}, exec_context) +{{type}}.MountImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.mount_image_policy) +{{type}}.User, config_parse_user_group_compat, 0, offsetof({{type}}, exec_context.user) +{{type}}.Group, config_parse_user_group_compat, 0, offsetof({{type}}, exec_context.group) +{{type}}.SupplementaryGroups, config_parse_user_group_strv_compat, 0, offsetof({{type}}, exec_context.supplementary_groups) +{{type}}.SetLoginEnvironment, config_parse_tristate, 0, offsetof({{type}}, exec_context.set_login_environment) +{{type}}.Nice, config_parse_exec_nice, 0, offsetof({{type}}, exec_context) +{{type}}.OOMScoreAdjust, config_parse_exec_oom_score_adjust, 0, offsetof({{type}}, exec_context) +{{type}}.CoredumpFilter, config_parse_exec_coredump_filter, 0, offsetof({{type}}, exec_context) +{{type}}.IOSchedulingClass, config_parse_exec_io_class, 0, offsetof({{type}}, exec_context) +{{type}}.IOSchedulingPriority, config_parse_exec_io_priority, 0, offsetof({{type}}, exec_context) +{{type}}.CPUSchedulingPolicy, config_parse_exec_cpu_sched_policy, 0, offsetof({{type}}, exec_context) +{{type}}.CPUSchedulingPriority, config_parse_exec_cpu_sched_prio, 0, offsetof({{type}}, exec_context) +{{type}}.CPUSchedulingResetOnFork, config_parse_bool, 0, offsetof({{type}}, exec_context.cpu_sched_reset_on_fork) +{{type}}.CPUAffinity, config_parse_exec_cpu_affinity, 0, offsetof({{type}}, exec_context) +{{type}}.NUMAPolicy, config_parse_numa_policy, 0, offsetof({{type}}, exec_context.numa_policy.type) +{{type}}.NUMAMask, config_parse_numa_mask, 0, offsetof({{type}}, exec_context.numa_policy) +{{type}}.UMask, config_parse_mode, 0, offsetof({{type}}, exec_context.umask) +{{type}}.Environment, config_parse_environ, 0, offsetof({{type}}, exec_context.environment) +{{type}}.EnvironmentFile, config_parse_unit_env_file, 0, offsetof({{type}}, exec_context.environment_files) +{{type}}.PassEnvironment, config_parse_pass_environ, 0, offsetof({{type}}, exec_context.pass_environment) +{{type}}.UnsetEnvironment, config_parse_unset_environ, 0, offsetof({{type}}, exec_context.unset_environment) +{{type}}.DynamicUser, config_parse_bool, true, offsetof({{type}}, exec_context.dynamic_user) +{{type}}.RemoveIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.remove_ipc) +{{type}}.StandardInput, config_parse_exec_input, 0, offsetof({{type}}, exec_context) +{{type}}.StandardOutput, config_parse_exec_output, 0, offsetof({{type}}, exec_context) +{{type}}.StandardError, config_parse_exec_output, 0, offsetof({{type}}, exec_context) +{{type}}.StandardInputText, config_parse_exec_input_text, 0, offsetof({{type}}, exec_context) +{{type}}.StandardInputData, config_parse_exec_input_data, 0, offsetof({{type}}, exec_context) +{{type}}.TTYPath, config_parse_unit_path_printf, 0, offsetof({{type}}, exec_context.tty_path) +{{type}}.TTYReset, config_parse_bool, 0, offsetof({{type}}, exec_context.tty_reset) +{{type}}.TTYVHangup, config_parse_bool, 0, offsetof({{type}}, exec_context.tty_vhangup) +{{type}}.TTYVTDisallocate, config_parse_bool, 0, offsetof({{type}}, exec_context.tty_vt_disallocate) +{{type}}.TTYRows, config_parse_tty_size, 0, offsetof({{type}}, exec_context.tty_rows) +{{type}}.TTYColumns, config_parse_tty_size, 0, offsetof({{type}}, exec_context.tty_cols) +{{type}}.SyslogIdentifier, config_parse_unit_string_printf, 0, offsetof({{type}}, exec_context.syslog_identifier) +{{type}}.SyslogFacility, config_parse_log_facility, 0, offsetof({{type}}, exec_context.syslog_priority) +{{type}}.SyslogLevel, config_parse_log_level, 0, offsetof({{type}}, exec_context.syslog_priority) +{{type}}.SyslogLevelPrefix, config_parse_bool, 0, offsetof({{type}}, exec_context.syslog_level_prefix) +{{type}}.LogLevelMax, config_parse_log_level, 0, offsetof({{type}}, exec_context.log_level_max) +{{type}}.LogRateLimitIntervalSec, config_parse_sec, 0, offsetof({{type}}, exec_context.log_ratelimit_interval_usec) +{{type}}.LogRateLimitBurst, config_parse_unsigned, 0, offsetof({{type}}, exec_context.log_ratelimit_burst) +{{type}}.LogExtraFields, config_parse_log_extra_fields, 0, offsetof({{type}}, exec_context) +{{type}}.LogFilterPatterns, config_parse_log_filter_patterns, 0, offsetof({{type}}, exec_context) +{{type}}.Capabilities, config_parse_warn_compat, DISABLED_LEGACY, offsetof({{type}}, exec_context) +{{type}}.SecureBits, config_parse_exec_secure_bits, 0, offsetof({{type}}, exec_context.secure_bits) +{{type}}.CapabilityBoundingSet, config_parse_capability_set, 0, offsetof({{type}}, exec_context.capability_bounding_set) +{{type}}.AmbientCapabilities, config_parse_capability_set, 0, offsetof({{type}}, exec_context.capability_ambient_set) +{{type}}.TimerSlackNSec, config_parse_nsec, 0, offsetof({{type}}, exec_context.timer_slack_nsec) +{{type}}.NoNewPrivileges, config_parse_bool, 0, offsetof({{type}}, exec_context.no_new_privileges) +{{type}}.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof({{type}}, exec_context.keyring_mode) +{{type}}.ProtectProc, config_parse_protect_proc, 0, offsetof({{type}}, exec_context.protect_proc) +{{type}}.ProcSubset, config_parse_proc_subset, 0, offsetof({{type}}, exec_context.proc_subset) +{% if HAVE_SECCOMP %} +{{type}}.SystemCallFilter, config_parse_syscall_filter, 0, offsetof({{type}}, exec_context) +{{type}}.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof({{type}}, exec_context.syscall_archs) +{{type}}.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof({{type}}, exec_context) +{{type}}.SystemCallLog, config_parse_syscall_log, 0, offsetof({{type}}, exec_context) +{{type}}.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof({{type}}, exec_context.memory_deny_write_execute) +{{type}}.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof({{type}}, exec_context) +{{type}}.RestrictRealtime, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_realtime) +{{type}}.RestrictSUIDSGID, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_suid_sgid) +{{type}}.RestrictAddressFamilies, config_parse_address_families, 0, offsetof({{type}}, exec_context) +{{type}}.LockPersonality, config_parse_bool, 0, offsetof({{type}}, exec_context.lock_personality) +{% else %} +{{type}}.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{{type}}.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{{type}}.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{{type}}.SystemCallLog, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{{type}}.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{{type}}.RestrictNamespaces, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{{type}}.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{{type}}.RestrictSUIDSGID, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{{type}}.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{{type}}.LockPersonality, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{% endif %} +{{type}}.RestrictFileSystems, config_parse_restrict_filesystems, 0, offsetof({{type}}, exec_context) +{{type}}.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitSTACK, config_parse_rlimit, RLIMIT_STACK, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitCORE, config_parse_rlimit, RLIMIT_CORE, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitRSS, config_parse_rlimit, RLIMIT_RSS, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitNOFILE, config_parse_rlimit, RLIMIT_NOFILE, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitAS, config_parse_rlimit, RLIMIT_AS, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitNPROC, config_parse_rlimit, RLIMIT_NPROC, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitMEMLOCK, config_parse_rlimit, RLIMIT_MEMLOCK, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitLOCKS, config_parse_rlimit, RLIMIT_LOCKS, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitSIGPENDING, config_parse_rlimit, RLIMIT_SIGPENDING, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitMSGQUEUE, config_parse_rlimit, RLIMIT_MSGQUEUE, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitNICE, config_parse_rlimit, RLIMIT_NICE, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitRTPRIO, config_parse_rlimit, RLIMIT_RTPRIO, offsetof({{type}}, exec_context.rlimit) +{{type}}.LimitRTTIME, config_parse_rlimit, RLIMIT_RTTIME, offsetof({{type}}, exec_context.rlimit) +{{type}}.ReadWriteDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_write_paths) +{{type}}.ReadOnlyDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_only_paths) +{{type}}.InaccessibleDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.inaccessible_paths) +{{type}}.ReadWritePaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_write_paths) +{{type}}.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_only_paths) +{{type}}.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.inaccessible_paths) +{{type}}.ExecPaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.exec_paths) +{{type}}.NoExecPaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.no_exec_paths) +{{type}}.ExecSearchPath, config_parse_colon_separated_paths, 0, offsetof({{type}}, exec_context.exec_search_path) +{{type}}.BindPaths, config_parse_bind_paths, 0, offsetof({{type}}, exec_context) +{{type}}.BindReadOnlyPaths, config_parse_bind_paths, 0, offsetof({{type}}, exec_context) +{{type}}.TemporaryFileSystem, config_parse_temporary_filesystems, 0, offsetof({{type}}, exec_context) +{{type}}.PrivateTmp, config_parse_bool, 0, offsetof({{type}}, exec_context.private_tmp) +{{type}}.PrivateDevices, config_parse_bool, 0, offsetof({{type}}, exec_context.private_devices) +{{type}}.ProtectKernelTunables, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_tunables) +{{type}}.ProtectKernelModules, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_modules) +{{type}}.ProtectKernelLogs, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_logs) +{{type}}.ProtectClock, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_clock) +{{type}}.ProtectControlGroups, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_control_groups) +{{type}}.NetworkNamespacePath, config_parse_unit_path_printf, 0, offsetof({{type}}, exec_context.network_namespace_path) +{{type}}.IPCNamespacePath, config_parse_unit_path_printf, 0, offsetof({{type}}, exec_context.ipc_namespace_path) +{{type}}.LogNamespace, config_parse_log_namespace, 0, offsetof({{type}}, exec_context) +{{type}}.PrivateNetwork, config_parse_bool, 0, offsetof({{type}}, exec_context.private_network) +{{type}}.PrivateUsers, config_parse_bool, 0, offsetof({{type}}, exec_context.private_users) +{{type}}.PrivateMounts, config_parse_tristate, 0, offsetof({{type}}, exec_context.private_mounts) +{{type}}.PrivateIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.private_ipc) +{{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system) +{{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home) +{{type}}.MountFlags, config_parse_exec_mount_propagation_flag, 0, offsetof({{type}}, exec_context.mount_propagation_flag) +{{type}}.MountAPIVFS, config_parse_exec_mount_apivfs, 0, offsetof({{type}}, exec_context) +{{type}}.Personality, config_parse_personality, 0, offsetof({{type}}, exec_context.personality) +{{type}}.RuntimeDirectoryPreserve, config_parse_exec_preserve_mode, 0, offsetof({{type}}, exec_context.runtime_directory_preserve_mode) +{{type}}.RuntimeDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME].mode) +{{type}}.RuntimeDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME]) +{{type}}.StateDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_STATE].mode) +{{type}}.StateDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_STATE]) +{{type}}.CacheDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CACHE].mode) +{{type}}.CacheDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CACHE]) +{{type}}.LogsDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_LOGS].mode) +{{type}}.LogsDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_LOGS]) +{{type}}.ConfigurationDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION].mode) +{{type}}.ConfigurationDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION]) +{{type}}.SetCredential, config_parse_set_credential, 0, offsetof({{type}}, exec_context) +{{type}}.SetCredentialEncrypted, config_parse_set_credential, 1, offsetof({{type}}, exec_context) +{{type}}.LoadCredential, config_parse_load_credential, 0, offsetof({{type}}, exec_context) +{{type}}.LoadCredentialEncrypted, config_parse_load_credential, 1, offsetof({{type}}, exec_context) +{{type}}.ImportCredential, config_parse_import_credential, 0, offsetof({{type}}, exec_context.import_credentials) +{{type}}.TimeoutCleanSec, config_parse_sec, 0, offsetof({{type}}, exec_context.timeout_clean_usec) +{% if HAVE_PAM %} +{{type}}.PAMName, config_parse_unit_string_printf, 0, offsetof({{type}}, exec_context.pam_name) +{% else %} +{{type}}.PAMName, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{% endif %} +{{type}}.IgnoreSIGPIPE, config_parse_bool, 0, offsetof({{type}}, exec_context.ignore_sigpipe) +{{type}}.UtmpIdentifier, config_parse_unit_string_printf, 0, offsetof({{type}}, exec_context.utmp_id) +{{type}}.UtmpMode, config_parse_exec_utmp_mode, 0, offsetof({{type}}, exec_context.utmp_mode) +{% if HAVE_SELINUX %} +{{type}}.SELinuxContext, config_parse_exec_selinux_context, 0, offsetof({{type}}, exec_context) +{% else %} +{{type}}.SELinuxContext, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{% endif %} +{% if HAVE_APPARMOR %} +{{type}}.AppArmorProfile, config_parse_exec_apparmor_profile, 0, offsetof({{type}}, exec_context) +{% else %} +{{type}}.AppArmorProfile, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{% endif %} +{% if ENABLE_SMACK %} +{{type}}.SmackProcessLabel, config_parse_exec_smack_process_label, 0, offsetof({{type}}, exec_context) +{% else %} +{{type}}.SmackProcessLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{% endif %} +{{type}}.ProtectHostname, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_hostname) +{{type}}.MemoryKSM, config_parse_tristate, 0, offsetof({{type}}, exec_context.memory_ksm) +{%- endmacro -%} + +{%- macro KILL_CONTEXT_CONFIG_ITEMS(type) -%} +{{type}}.SendSIGKILL, config_parse_bool, 0, offsetof({{type}}, kill_context.send_sigkill) +{{type}}.SendSIGHUP, config_parse_bool, 0, offsetof({{type}}, kill_context.send_sighup) +{{type}}.KillMode, config_parse_kill_mode, 0, offsetof({{type}}, kill_context.kill_mode) +{{type}}.KillSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.kill_signal) +{{type}}.RestartKillSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.restart_kill_signal) +{{type}}.FinalKillSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.final_kill_signal) +{{type}}.WatchdogSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.watchdog_signal) +{%- endmacro -%} + +{%- macro CGROUP_CONTEXT_CONFIG_ITEMS(type) -%} +{{type}}.Slice, config_parse_unit_slice, 0, 0 +{{type}}.AllowedCPUs, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.cpuset_cpus) +{{type}}.StartupAllowedCPUs, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.startup_cpuset_cpus) +{{type}}.AllowedMemoryNodes, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.cpuset_mems) +{{type}}.StartupAllowedMemoryNodes, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.startup_cpuset_mems) +{{type}}.CPUAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.cpu_accounting) +{{type}}.CPUWeight, config_parse_cg_cpu_weight, 0, offsetof({{type}}, cgroup_context.cpu_weight) +{{type}}.StartupCPUWeight, config_parse_cg_cpu_weight, 0, offsetof({{type}}, cgroup_context.startup_cpu_weight) +{{type}}.CPUShares, config_parse_cpu_shares, 0, offsetof({{type}}, cgroup_context.cpu_shares) +{{type}}.StartupCPUShares, config_parse_cpu_shares, 0, offsetof({{type}}, cgroup_context.startup_cpu_shares) +{{type}}.CPUQuota, config_parse_cpu_quota, 0, offsetof({{type}}, cgroup_context) +{{type}}.CPUQuotaPeriodSec, config_parse_sec_def_infinity, 0, offsetof({{type}}, cgroup_context.cpu_quota_period_usec) +{{type}}.MemoryAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.memory_accounting) +{{type}}.MemoryMin, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.DefaultMemoryMin, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.DefaultMemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.DefaultStartupMemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.MemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.StartupMemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.MemoryHigh, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.StartupMemoryHigh, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.MemoryMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.StartupMemoryMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.MemorySwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.StartupMemorySwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.MemoryZSwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.StartupMemoryZSwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.MemoryLimit, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.DeviceAllow, config_parse_device_allow, 0, offsetof({{type}}, cgroup_context) +{{type}}.DevicePolicy, config_parse_device_policy, 0, offsetof({{type}}, cgroup_context.device_policy) +{{type}}.IOAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.io_accounting) +{{type}}.IOWeight, config_parse_cg_weight, 0, offsetof({{type}}, cgroup_context.io_weight) +{{type}}.StartupIOWeight, config_parse_cg_weight, 0, offsetof({{type}}, cgroup_context.startup_io_weight) +{{type}}.IODeviceWeight, config_parse_io_device_weight, 0, offsetof({{type}}, cgroup_context) +{{type}}.IOReadBandwidthMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.IOWriteBandwidthMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.IOReadIOPSMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.IOWriteIOPSMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context) +{{type}}.IODeviceLatencyTargetSec, config_parse_io_device_latency, 0, offsetof({{type}}, cgroup_context) +{{type}}.BlockIOAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.blockio_accounting) +{{type}}.BlockIOWeight, config_parse_blockio_weight, 0, offsetof({{type}}, cgroup_context.blockio_weight) +{{type}}.StartupBlockIOWeight, config_parse_blockio_weight, 0, offsetof({{type}}, cgroup_context.startup_blockio_weight) +{{type}}.BlockIODeviceWeight, config_parse_blockio_device_weight, 0, offsetof({{type}}, cgroup_context) +{{type}}.BlockIOReadBandwidth, config_parse_blockio_bandwidth, 0, offsetof({{type}}, cgroup_context) +{{type}}.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, offsetof({{type}}, cgroup_context) +{{type}}.TasksAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.tasks_accounting) +{{type}}.TasksMax, config_parse_tasks_max, 0, offsetof({{type}}, cgroup_context.tasks_max) +{{type}}.Delegate, config_parse_delegate, 0, offsetof({{type}}, cgroup_context) +{{type}}.DelegateSubgroup, config_parse_delegate_subgroup , 0, offsetof({{type}}, cgroup_context) +{{type}}.DisableControllers, config_parse_disable_controllers, 0, offsetof({{type}}, cgroup_context) +{{type}}.IPAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.ip_accounting) +{{type}}.IPAddressAllow, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_allow) +{{type}}.IPAddressDeny, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_deny) +{{type}}.IPIngressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof({{type}}, cgroup_context.ip_filters_ingress) +{{type}}.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof({{type}}, cgroup_context.ip_filters_egress) +{{type}}.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_swap) +{{type}}.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure) +{{type}}.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure_limit) +{{type}}.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof({{type}}, cgroup_context.moom_preference) +{{type}}.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0 +{{type}}.BPFProgram, config_parse_bpf_foreign_program, 0, offsetof({{type}}, cgroup_context) +{{type}}.SocketBindAllow, config_parse_cgroup_socket_bind, 0, offsetof({{type}}, cgroup_context.socket_bind_allow) +{{type}}.SocketBindDeny, config_parse_cgroup_socket_bind, 0, offsetof({{type}}, cgroup_context.socket_bind_deny) +{{type}}.RestrictNetworkInterfaces, config_parse_restrict_network_interfaces, 0, offsetof({{type}}, cgroup_context) +{{type}}.MemoryPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, cgroup_context.memory_pressure_threshold_usec) +{{type}}.MemoryPressureWatch, config_parse_memory_pressure_watch, 0, offsetof({{type}}, cgroup_context.memory_pressure_watch) +{{type}}.NFTSet, config_parse_cgroup_nft_set, NFT_SET_PARSE_CGROUP, offsetof({{type}}, cgroup_context) +{{type}}.CoredumpReceive, config_parse_bool, 0, offsetof({{type}}, cgroup_context.coredump_receive) +{%- endmacro -%} + +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include "all-units.h" +#include "conf-parser.h" +#include "image-policy.h" +#include "in-addr-prefix-util.h" +#include "load-fragment.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name load_fragment_gperf_hash +%define lookup-function-name load_fragment_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) +Unit.Documentation, config_parse_documentation, 0, offsetof(Unit, documentation) +Unit.SourcePath, config_parse_unit_path_printf, 0, offsetof(Unit, source_path) +Unit.Requires, config_parse_unit_deps, UNIT_REQUIRES, 0 +Unit.Requisite, config_parse_unit_deps, UNIT_REQUISITE, 0 +Unit.Wants, config_parse_unit_deps, UNIT_WANTS, 0 +Unit.BindsTo, config_parse_unit_deps, UNIT_BINDS_TO, 0 +Unit.BindTo, config_parse_unit_deps, UNIT_BINDS_TO, 0 +Unit.Upholds, config_parse_unit_deps, UNIT_UPHOLDS, 0 +Unit.Conflicts, config_parse_unit_deps, UNIT_CONFLICTS, 0 +Unit.Before, config_parse_unit_deps, UNIT_BEFORE, 0 +Unit.After, config_parse_unit_deps, UNIT_AFTER, 0 +Unit.OnSuccess, config_parse_unit_deps, UNIT_ON_SUCCESS, 0 +Unit.OnFailure, config_parse_unit_deps, UNIT_ON_FAILURE, 0 +Unit.PropagatesReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0 +Unit.PropagateReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0 +Unit.ReloadPropagatedFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0 +Unit.PropagateReloadFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0 +Unit.PropagatesStopTo, config_parse_unit_deps, UNIT_PROPAGATES_STOP_TO, 0 +Unit.StopPropagatedFrom, config_parse_unit_deps, UNIT_STOP_PROPAGATED_FROM, 0 +Unit.PartOf, config_parse_unit_deps, UNIT_PART_OF, 0 +Unit.JoinsNamespaceOf, config_parse_unit_deps, UNIT_JOINS_NAMESPACE_OF, 0 +Unit.RequiresOverridable, config_parse_obsolete_unit_deps, UNIT_REQUIRES, 0 +Unit.RequisiteOverridable, config_parse_obsolete_unit_deps, UNIT_REQUISITE, 0 +Unit.RequiresMountsFor, config_parse_unit_requires_mounts_for, 0, 0 +Unit.StopWhenUnneeded, config_parse_bool, 0, offsetof(Unit, stop_when_unneeded) +Unit.RefuseManualStart, config_parse_bool, 0, offsetof(Unit, refuse_manual_start) +Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop) +Unit.AllowIsolate, config_parse_bool, 0, offsetof(Unit, allow_isolate) +Unit.DefaultDependencies, config_parse_bool, 0, offsetof(Unit, default_dependencies) +Unit.SurviveFinalKillSignal, config_parse_bool, 0, offsetof(Unit, survive_final_kill_signal) +Unit.OnSuccessJobMode, config_parse_job_mode, 0, offsetof(Unit, on_success_job_mode) +Unit.OnFailureJobMode, config_parse_job_mode, 0, offsetof(Unit, on_failure_job_mode) +{# The following is a legacy alias name for compatibility #} +Unit.OnFailureIsolate, config_parse_job_mode_isolate, 0, offsetof(Unit, on_failure_job_mode) +Unit.IgnoreOnIsolate, config_parse_bool, 0, offsetof(Unit, ignore_on_isolate) +Unit.IgnoreOnSnapshot, config_parse_warn_compat, DISABLED_LEGACY, 0 +Unit.JobTimeoutSec, config_parse_job_timeout_sec, 0, 0 +Unit.JobRunningTimeoutSec, config_parse_job_running_timeout_sec, 0, 0 +Unit.JobTimeoutAction, config_parse_emergency_action, 0, offsetof(Unit, job_timeout_action) +Unit.JobTimeoutRebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, job_timeout_reboot_arg) +Unit.StartLimitIntervalSec, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval) +{# The following is a legacy alias name for compatibility #} +Unit.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval) +Unit.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_ratelimit.burst) +Unit.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action) +Unit.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action) +Unit.SuccessAction, config_parse_emergency_action, 0, offsetof(Unit, success_action) +Unit.FailureActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, failure_action_exit_status) +Unit.SuccessActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, success_action_exit_status) +Unit.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg) +Unit.ConditionPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, conditions) +Unit.ConditionPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, conditions) +Unit.ConditionPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, conditions) +Unit.ConditionPathIsSymbolicLink, config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK, offsetof(Unit, conditions) +Unit.ConditionPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, conditions) +Unit.ConditionPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, conditions) +Unit.ConditionPathIsEncrypted, config_parse_unit_condition_path, CONDITION_PATH_IS_ENCRYPTED, offsetof(Unit, conditions) +Unit.ConditionDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, conditions) +Unit.ConditionFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, conditions) +Unit.ConditionFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, conditions) +Unit.ConditionNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, conditions) +Unit.ConditionFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, conditions) +Unit.ConditionArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, conditions) +Unit.ConditionFirmware, config_parse_unit_condition_string, CONDITION_FIRMWARE, offsetof(Unit, conditions) +Unit.ConditionVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, conditions) +Unit.ConditionHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, conditions) +Unit.ConditionKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, conditions) +Unit.ConditionKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, conditions) +Unit.ConditionCredential, config_parse_unit_condition_string, CONDITION_CREDENTIAL, offsetof(Unit, conditions) +Unit.ConditionSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, conditions) +Unit.ConditionCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, conditions) +Unit.ConditionACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, conditions) +Unit.ConditionMemory, config_parse_unit_condition_string, CONDITION_MEMORY, offsetof(Unit, conditions) +Unit.ConditionCPUFeature, config_parse_unit_condition_string, CONDITION_CPU_FEATURE, offsetof(Unit, conditions) +Unit.ConditionCPUs, config_parse_unit_condition_string, CONDITION_CPUS, offsetof(Unit, conditions) +Unit.ConditionEnvironment, config_parse_unit_condition_string, CONDITION_ENVIRONMENT, offsetof(Unit, conditions) +Unit.ConditionUser, config_parse_unit_condition_string, CONDITION_USER, offsetof(Unit, conditions) +Unit.ConditionGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, conditions) +Unit.ConditionControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, conditions) +Unit.ConditionOSRelease, config_parse_unit_condition_string, CONDITION_OS_RELEASE, offsetof(Unit, conditions) +Unit.ConditionMemoryPressure, config_parse_unit_condition_string, CONDITION_MEMORY_PRESSURE, offsetof(Unit, conditions) +Unit.ConditionCPUPressure, config_parse_unit_condition_string, CONDITION_CPU_PRESSURE, offsetof(Unit, conditions) +Unit.ConditionIOPressure, config_parse_unit_condition_string, CONDITION_IO_PRESSURE, offsetof(Unit, conditions) +Unit.AssertPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, asserts) +Unit.AssertPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, asserts) +Unit.AssertPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, asserts) +Unit.AssertPathIsSymbolicLink, config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK, offsetof(Unit, asserts) +Unit.AssertPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, asserts) +Unit.AssertPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, asserts) +Unit.AssertPathIsEncrypted, config_parse_unit_condition_path, CONDITION_PATH_IS_ENCRYPTED, offsetof(Unit, asserts) +Unit.AssertDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, asserts) +Unit.AssertFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, asserts) +Unit.AssertFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, asserts) +Unit.AssertNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, asserts) +Unit.AssertFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, asserts) +Unit.AssertArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, asserts) +Unit.AssertVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, asserts) +Unit.AssertHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, asserts) +Unit.AssertKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, asserts) +Unit.AssertKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, asserts) +Unit.AssertCredential, config_parse_unit_condition_string, CONDITION_CREDENTIAL, offsetof(Unit, asserts) +Unit.AssertSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, asserts) +Unit.AssertCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, asserts) +Unit.AssertACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, asserts) +Unit.AssertMemory, config_parse_unit_condition_string, CONDITION_MEMORY, offsetof(Unit, asserts) +Unit.AssertCPUFeature, config_parse_unit_condition_string, CONDITION_CPU_FEATURE, offsetof(Unit, asserts) +Unit.AssertCPUs, config_parse_unit_condition_string, CONDITION_CPUS, offsetof(Unit, asserts) +Unit.AssertEnvironment, config_parse_unit_condition_string, CONDITION_ENVIRONMENT, offsetof(Unit, asserts) +Unit.AssertUser, config_parse_unit_condition_string, CONDITION_USER, offsetof(Unit, asserts) +Unit.AssertGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, asserts) +Unit.AssertControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, asserts) +Unit.AssertOSRelease, config_parse_unit_condition_string, CONDITION_OS_RELEASE, offsetof(Unit, asserts) +Unit.AssertMemoryPressure, config_parse_unit_condition_string, CONDITION_MEMORY_PRESSURE, offsetof(Unit, asserts) +Unit.AssertCPUPressure, config_parse_unit_condition_string, CONDITION_CPU_PRESSURE, offsetof(Unit, asserts) +Unit.AssertIOPressure, config_parse_unit_condition_string, CONDITION_IO_PRESSURE, offsetof(Unit, asserts) +Unit.CollectMode, config_parse_collect_mode, 0, offsetof(Unit, collect_mode) +Service.PIDFile, config_parse_pid_file, 0, offsetof(Service, pid_file) +Service.ExecCondition, config_parse_exec, SERVICE_EXEC_CONDITION, offsetof(Service, exec_command) +Service.ExecStartPre, config_parse_exec, SERVICE_EXEC_START_PRE, offsetof(Service, exec_command) +Service.ExecStart, config_parse_exec, SERVICE_EXEC_START, offsetof(Service, exec_command) +Service.ExecStartPost, config_parse_exec, SERVICE_EXEC_START_POST, offsetof(Service, exec_command) +Service.ExecReload, config_parse_exec, SERVICE_EXEC_RELOAD, offsetof(Service, exec_command) +Service.ExecStop, config_parse_exec, SERVICE_EXEC_STOP, offsetof(Service, exec_command) +Service.ExecStopPost, config_parse_exec, SERVICE_EXEC_STOP_POST, offsetof(Service, exec_command) +Service.RestartSec, config_parse_sec, 0, offsetof(Service, restart_usec) +Service.RestartSteps, config_parse_unsigned, 0, offsetof(Service, restart_steps) +Service.RestartMaxDelaySec, config_parse_sec, 0, offsetof(Service, restart_max_delay_usec) +Service.TimeoutSec, config_parse_service_timeout, 0, 0 +Service.TimeoutStartSec, config_parse_service_timeout, 0, 0 +Service.TimeoutStopSec, config_parse_sec_fix_0, 0, offsetof(Service, timeout_stop_usec) +Service.TimeoutAbortSec, config_parse_service_timeout_abort, 0, 0 +Service.TimeoutStartFailureMode, config_parse_service_timeout_failure_mode, 0, offsetof(Service, timeout_start_failure_mode) +Service.TimeoutStopFailureMode, config_parse_service_timeout_failure_mode, 0, offsetof(Service, timeout_stop_failure_mode) +Service.RuntimeMaxSec, config_parse_sec, 0, offsetof(Service, runtime_max_usec) +Service.RuntimeRandomizedExtraSec, config_parse_sec, 0, offsetof(Service, runtime_rand_extra_usec) +Service.WatchdogSec, config_parse_sec, 0, offsetof(Service, watchdog_usec) +{# The following five only exist for compatibility, they moved into Unit, see above #} +Service.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval) +Service.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_ratelimit.burst) +Service.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action) +Service.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action) +Service.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg) +Service.Type, config_parse_service_type, 0, offsetof(Service, type) +Service.ExitType, config_parse_service_exit_type, 0, offsetof(Service, exit_type) +Service.Restart, config_parse_service_restart, 0, offsetof(Service, restart) +Service.RestartMode, config_parse_service_restart_mode, 0, offsetof(Service, restart_mode) +Service.PermissionsStartOnly, config_parse_bool, 0, offsetof(Service, permissions_start_only) +Service.RootDirectoryStartOnly, config_parse_bool, 0, offsetof(Service, root_directory_start_only) +Service.RemainAfterExit, config_parse_bool, 0, offsetof(Service, remain_after_exit) +Service.GuessMainPID, config_parse_bool, 0, offsetof(Service, guess_main_pid) +Service.RestartPreventExitStatus, config_parse_set_status, 0, offsetof(Service, restart_prevent_status) +Service.RestartForceExitStatus, config_parse_set_status, 0, offsetof(Service, restart_force_status) +Service.SuccessExitStatus, config_parse_set_status, 0, offsetof(Service, success_status) +Service.SysVStartPriority, config_parse_warn_compat, DISABLED_LEGACY, 0 +Service.NonBlocking, config_parse_bool, 0, offsetof(Service, exec_context.non_blocking) +Service.BusName, config_parse_bus_name, 0, offsetof(Service, bus_name) +Service.FileDescriptorStoreMax, config_parse_unsigned, 0, offsetof(Service, n_fd_store_max) +Service.FileDescriptorStorePreserve, config_parse_exec_preserve_mode, 0, offsetof(Service, fd_store_preserve_mode) +Service.NotifyAccess, config_parse_notify_access, 0, offsetof(Service, notify_access) +Service.Sockets, config_parse_service_sockets, 0, 0 +Service.BusPolicy, config_parse_warn_compat, DISABLED_LEGACY, 0 +Service.USBFunctionDescriptors, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_descriptors) +Service.USBFunctionStrings, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_strings) +Service.OOMPolicy, config_parse_oom_policy, 0, offsetof(Service, oom_policy) +Service.OpenFile, config_parse_open_file, 0, offsetof(Service, open_files) +Service.ReloadSignal, config_parse_signal, 0, offsetof(Service, reload_signal) +{{ EXEC_CONTEXT_CONFIG_ITEMS('Service') }} +{{ CGROUP_CONTEXT_CONFIG_ITEMS('Service') }} +{{ KILL_CONTEXT_CONFIG_ITEMS('Service') }} +Socket.ListenStream, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenDatagram, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenSequentialPacket, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenFIFO, config_parse_socket_listen, SOCKET_FIFO, 0 +Socket.ListenNetlink, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenSpecial, config_parse_socket_listen, SOCKET_SPECIAL, 0 +Socket.ListenMessageQueue, config_parse_socket_listen, SOCKET_MQUEUE, 0 +Socket.ListenUSBFunction, config_parse_socket_listen, SOCKET_USB_FUNCTION, 0 +Socket.SocketProtocol, config_parse_socket_protocol, 0, offsetof(Socket, socket_protocol) +Socket.BindIPv6Only, config_parse_socket_bind, 0, offsetof(Socket, bind_ipv6_only) +Socket.Backlog, config_parse_unsigned, 0, offsetof(Socket, backlog) +Socket.BindToDevice, config_parse_socket_bindtodevice, 0, 0 +Socket.ExecStartPre, config_parse_exec, SOCKET_EXEC_START_PRE, offsetof(Socket, exec_command) +Socket.ExecStartPost, config_parse_exec, SOCKET_EXEC_START_POST, offsetof(Socket, exec_command) +Socket.ExecStopPre, config_parse_exec, SOCKET_EXEC_STOP_PRE, offsetof(Socket, exec_command) +Socket.ExecStopPost, config_parse_exec, SOCKET_EXEC_STOP_POST, offsetof(Socket, exec_command) +Socket.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Socket, timeout_usec) +Socket.SocketUser, config_parse_user_group_compat, 0, offsetof(Socket, user) +Socket.SocketGroup, config_parse_user_group_compat, 0, offsetof(Socket, group) +Socket.SocketMode, config_parse_mode, 0, offsetof(Socket, socket_mode) +Socket.DirectoryMode, config_parse_mode, 0, offsetof(Socket, directory_mode) +Socket.Accept, config_parse_bool, 0, offsetof(Socket, accept) +Socket.FlushPending, config_parse_bool, 0, offsetof(Socket, flush_pending) +Socket.Writable, config_parse_bool, 0, offsetof(Socket, writable) +Socket.MaxConnections, config_parse_unsigned, 0, offsetof(Socket, max_connections) +Socket.MaxConnectionsPerSource, config_parse_unsigned, 0, offsetof(Socket, max_connections_per_source) +Socket.KeepAlive, config_parse_bool, 0, offsetof(Socket, keep_alive) +Socket.KeepAliveTimeSec, config_parse_sec, 0, offsetof(Socket, keep_alive_time) +Socket.KeepAliveIntervalSec, config_parse_sec, 0, offsetof(Socket, keep_alive_interval) +Socket.KeepAliveProbes, config_parse_unsigned, 0, offsetof(Socket, keep_alive_cnt) +Socket.DeferAcceptSec, config_parse_sec, 0, offsetof(Socket, defer_accept) +Socket.NoDelay, config_parse_bool, 0, offsetof(Socket, no_delay) +Socket.Priority, config_parse_int, 0, offsetof(Socket, priority) +Socket.ReceiveBuffer, config_parse_iec_size, 0, offsetof(Socket, receive_buffer) +Socket.SendBuffer, config_parse_iec_size, 0, offsetof(Socket, send_buffer) +Socket.IPTOS, config_parse_ip_tos, 0, offsetof(Socket, ip_tos) +Socket.IPTTL, config_parse_int, 0, offsetof(Socket, ip_ttl) +Socket.Mark, config_parse_int, 0, offsetof(Socket, mark) +Socket.PipeSize, config_parse_iec_size, 0, offsetof(Socket, pipe_size) +Socket.FreeBind, config_parse_bool, 0, offsetof(Socket, free_bind) +Socket.Transparent, config_parse_bool, 0, offsetof(Socket, transparent) +Socket.Broadcast, config_parse_bool, 0, offsetof(Socket, broadcast) +Socket.PassCredentials, config_parse_bool, 0, offsetof(Socket, pass_cred) +Socket.PassSecurity, config_parse_bool, 0, offsetof(Socket, pass_sec) +Socket.PassPacketInfo, config_parse_bool, 0, offsetof(Socket, pass_pktinfo) +Socket.Timestamping, config_parse_socket_timestamping, 0, offsetof(Socket, timestamping) +Socket.TCPCongestion, config_parse_string, 0, offsetof(Socket, tcp_congestion) +Socket.ReusePort, config_parse_bool, 0, offsetof(Socket, reuse_port) +Socket.MessageQueueMaxMessages, config_parse_long, 0, offsetof(Socket, mq_maxmsg) +Socket.MessageQueueMessageSize, config_parse_long, 0, offsetof(Socket, mq_msgsize) +Socket.RemoveOnStop, config_parse_bool, 0, offsetof(Socket, remove_on_stop) +Socket.Symlinks, config_parse_unit_path_strv_printf, 0, offsetof(Socket, symlinks) +Socket.FileDescriptorName, config_parse_fdname, 0, 0 +Socket.Service, config_parse_socket_service, 0, 0 +Socket.TriggerLimitIntervalSec, config_parse_sec, 0, offsetof(Socket, trigger_limit.interval) +Socket.TriggerLimitBurst, config_parse_unsigned, 0, offsetof(Socket, trigger_limit.burst) +Socket.PollLimitIntervalSec, config_parse_sec, 0, offsetof(Socket, poll_limit_interval) +Socket.PollLimitBurst, config_parse_unsigned, 0, offsetof(Socket, poll_limit_burst) +{% if ENABLE_SMACK %} +Socket.SmackLabel, config_parse_unit_string_printf, 0, offsetof(Socket, smack) +Socket.SmackLabelIPIn, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_in) +Socket.SmackLabelIPOut, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_out) +{% else %} +Socket.SmackLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +Socket.SmackLabelIPIn, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +Socket.SmackLabelIPOut, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{% endif %} +{% if HAVE_SELINUX %} +Socket.SELinuxContextFromNet, config_parse_bool, 0, offsetof(Socket, selinux_context_from_net) +{% else %} +Socket.SELinuxContextFromNet, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +{% endif %} +{{ EXEC_CONTEXT_CONFIG_ITEMS('Socket') }} +{{ CGROUP_CONTEXT_CONFIG_ITEMS('Socket') }} +{{ KILL_CONTEXT_CONFIG_ITEMS('Socket') }} +Mount.What, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.what) +Mount.Where, config_parse_unit_path_printf, 0, offsetof(Mount, where) +Mount.Options, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.options) +Mount.Type, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.fstype) +Mount.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Mount, timeout_usec) +Mount.DirectoryMode, config_parse_mode, 0, offsetof(Mount, directory_mode) +Mount.SloppyOptions, config_parse_bool, 0, offsetof(Mount, sloppy_options) +Mount.LazyUnmount, config_parse_bool, 0, offsetof(Mount, lazy_unmount) +Mount.ForceUnmount, config_parse_bool, 0, offsetof(Mount, force_unmount) +Mount.ReadWriteOnly, config_parse_bool, 0, offsetof(Mount, read_write_only) +{{ EXEC_CONTEXT_CONFIG_ITEMS('Mount') }} +{{ CGROUP_CONTEXT_CONFIG_ITEMS('Mount') }} +{{ KILL_CONTEXT_CONFIG_ITEMS('Mount') }} +Automount.Where, config_parse_unit_path_printf, 0, offsetof(Automount, where) +Automount.ExtraOptions, config_parse_unit_string_printf, 0, offsetof(Automount, extra_options) +Automount.DirectoryMode, config_parse_mode, 0, offsetof(Automount, directory_mode) +Automount.TimeoutIdleSec, config_parse_sec_fix_0, 0, offsetof(Automount, timeout_idle_usec) +Swap.What, config_parse_unit_path_printf, 0, offsetof(Swap, parameters_fragment.what) +Swap.Priority, config_parse_swap_priority, 0, 0 +Swap.Options, config_parse_unit_string_printf, 0, offsetof(Swap, parameters_fragment.options) +Swap.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Swap, timeout_usec) +{{ EXEC_CONTEXT_CONFIG_ITEMS('Swap') }} +{{ CGROUP_CONTEXT_CONFIG_ITEMS('Swap') }} +{{ KILL_CONTEXT_CONFIG_ITEMS('Swap') }} +Timer.OnCalendar, config_parse_timer, TIMER_CALENDAR, 0 +Timer.OnActiveSec, config_parse_timer, TIMER_ACTIVE, 0 +Timer.OnBootSec, config_parse_timer, TIMER_BOOT, 0 +Timer.OnStartupSec, config_parse_timer, TIMER_STARTUP, 0 +Timer.OnUnitActiveSec, config_parse_timer, TIMER_UNIT_ACTIVE, 0 +Timer.OnUnitInactiveSec, config_parse_timer, TIMER_UNIT_INACTIVE, 0 +Timer.OnClockChange, config_parse_bool, 0, offsetof(Timer, on_clock_change) +Timer.OnTimezoneChange, config_parse_bool, 0, offsetof(Timer, on_timezone_change) +Timer.Persistent, config_parse_bool, 0, offsetof(Timer, persistent) +Timer.WakeSystem, config_parse_bool, 0, offsetof(Timer, wake_system) +Timer.RemainAfterElapse, config_parse_bool, 0, offsetof(Timer, remain_after_elapse) +Timer.FixedRandomDelay, config_parse_bool, 0, offsetof(Timer, fixed_random_delay) +Timer.AccuracySec, config_parse_sec, 0, offsetof(Timer, accuracy_usec) +Timer.RandomizedDelaySec, config_parse_sec, 0, offsetof(Timer, random_usec) +Timer.Unit, config_parse_trigger_unit, 0, 0 +Path.PathExists, config_parse_path_spec, 0, 0 +Path.PathExistsGlob, config_parse_path_spec, 0, 0 +Path.PathChanged, config_parse_path_spec, 0, 0 +Path.PathModified, config_parse_path_spec, 0, 0 +Path.DirectoryNotEmpty, config_parse_path_spec, 0, 0 +Path.Unit, config_parse_trigger_unit, 0, 0 +Path.MakeDirectory, config_parse_bool, 0, offsetof(Path, make_directory) +Path.DirectoryMode, config_parse_mode, 0, offsetof(Path, directory_mode) +Path.TriggerLimitIntervalSec, config_parse_sec, 0, offsetof(Path, trigger_limit.interval) +Path.TriggerLimitBurst, config_parse_unsigned, 0, offsetof(Path, trigger_limit.burst) +{{ CGROUP_CONTEXT_CONFIG_ITEMS('Slice') }} +{{ CGROUP_CONTEXT_CONFIG_ITEMS('Scope') }} +{{ KILL_CONTEXT_CONFIG_ITEMS('Scope') }} +Scope.RuntimeMaxSec, config_parse_sec, 0, offsetof(Scope, runtime_max_usec) +Scope.RuntimeRandomizedExtraSec, config_parse_sec, 0, offsetof(Scope, runtime_rand_extra_usec) +Scope.TimeoutStopSec, config_parse_sec, 0, offsetof(Scope, timeout_stop_usec) +Scope.OOMPolicy, config_parse_oom_policy, 0, offsetof(Scope, oom_policy) +{# The [Install] section is ignored here #} +Install.Alias, NULL, 0, 0 +Install.WantedBy, NULL, 0, 0 +Install.RequiredBy, NULL, 0, 0 +Install.UpheldBy, NULL, 0, 0 +Install.Also, NULL, 0, 0 +Install.DefaultInstance, NULL, 0, 0 diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c new file mode 100644 index 0000000..0baf08e --- /dev/null +++ b/src/core/load-fragment.c @@ -0,0 +1,6735 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2012 Holger Hans Peter Freyther +***/ + +#include +#include +#include +#include +#include +#include + +#include "sd-messages.h" + +#include "af-list.h" +#include "all-units.h" +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bpf-lsm.h" +#include "bpf-program.h" +#include "bpf-socket-bind.h" +#include "bus-error.h" +#include "bus-internal.h" +#include "bus-util.h" +#include "cap-list.h" +#include "capability-util.h" +#include "cgroup-setup.h" +#include "conf-parser.h" +#include "core-varlink.h" +#include "cpu-set-util.h" +#include "creds-util.h" +#include "env-util.h" +#include "errno-list.h" +#include "escape.h" +#include "exec-credential.h" +#include "execute.h" +#include "fd-util.h" +#include "fileio.h" +#include "firewall-util.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "iovec-util.h" +#include "ioprio-util.h" +#include "ip-protocol-list.h" +#include "journal-file.h" +#include "limits-util.h" +#include "load-fragment.h" +#include "log.h" +#include "missing_ioprio.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "open-file.h" +#include "parse-helpers.h" +#include "parse-util.h" +#include "path-util.h" +#include "pcre2-util.h" +#include "percent-util.h" +#include "process-util.h" +#include "seccomp-util.h" +#include "securebits-util.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "socket-netlink.h" +#include "specifier.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "time-util.h" +#include "unit-name.h" +#include "unit-printf.h" +#include "user-util.h" +#include "utf8.h" +#include "web-util.h" + +static int parse_socket_protocol(const char *s) { + int r; + + r = parse_ip_protocol(s); + if (r < 0) + return r; + if (!IN_SET(r, IPPROTO_UDPLITE, IPPROTO_SCTP)) + return -EPROTONOSUPPORT; + + return r; +} + +int parse_crash_chvt(const char *value, int *data) { + int b; + + if (safe_atoi(value, data) >= 0) + return 0; + + b = parse_boolean(value); + if (b < 0) + return b; + + if (b > 0) + *data = 0; /* switch to where kmsg goes */ + else + *data = -1; /* turn off switching */ + + return 0; +} + +int parse_confirm_spawn(const char *value, char **console) { + char *s; + int r; + + r = value ? parse_boolean(value) : 1; + if (r == 0) { + *console = NULL; + return 0; + } else if (r > 0) /* on with default tty */ + s = strdup("/dev/console"); + else if (is_path(value)) /* on with fully qualified path */ + s = strdup(value); + else /* on with only a tty file name, not a fully qualified path */ + s = path_join("/dev/", value); + if (!s) + return -ENOMEM; + + *console = s; + return 0; +} + +DEFINE_CONFIG_PARSE(config_parse_socket_protocol, parse_socket_protocol, "Failed to parse socket protocol"); +DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Failed to parse secure bits"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_home, protect_home, ProtectHome, "Failed to parse protect home value"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_system, protect_system, ProtectSystem, "Failed to parse protect system value"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode, "Failed to parse resource preserve mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_type, service_type, ServiceType, "Failed to parse service type"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_exit_type, service_exit_type, ServiceExitType, "Failed to parse service exit type"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceRestart, "Failed to parse service restart specifier"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart_mode, service_restart_mode, ServiceRestartMode, "Failed to parse service restart mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode, "Failed to parse timeout failure mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference="); +DEFINE_CONFIG_PARSE_ENUM(config_parse_memory_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch, "Failed to parse memory pressure watch setting"); +DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value"); +DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight"); +DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight"); +DEFINE_CONFIG_PARSE_PTR(config_parse_cg_cpu_weight, cg_cpu_weight_parse, uint64_t, "Invalid CPU weight"); +static DEFINE_CONFIG_PARSE_PTR(config_parse_cpu_shares_internal, cg_cpu_shares_parse, uint64_t, "Invalid CPU shares"); +DEFINE_CONFIG_PARSE_PTR(config_parse_exec_mount_propagation_flag, mount_propagation_flag_from_string, unsigned long, "Failed to parse mount propagation flag"); +DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_numa_policy, mpol, int, -1, "Invalid NUMA policy type"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_status_unit_format, status_unit_format, StatusUnitFormat, "Failed to parse status unit format"); +DEFINE_CONFIG_PARSE_ENUM_FULL(config_parse_socket_timestamping, socket_timestamping_from_string_harder, SocketTimestamping, "Failed to parse timestamping precision"); + +int config_parse_cpu_shares( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + + + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unit uses %s=; please use CPUWeight= instead. Support for %s= will be removed soon.", + lvalue, lvalue); + + return config_parse_cpu_shares_internal(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata); +} + +bool contains_instance_specifier_superset(const char *s) { + const char *p, *q; + bool percent = false; + + assert(s); + + p = strchr(s, '@'); + if (!p) + return false; + + p++; /* Skip '@' */ + + q = strrchr(p, '.'); + if (!q) + q = p + strlen(p); + + /* If the string is just the instance specifier, it's not a superset of the instance. */ + if (memcmp_nn(p, q - p, "%i", strlen("%i")) == 0) + return false; + + /* %i, %n and %N all expand to the instance or a superset of it. */ + for (; p < q; p++) + if (*p == '%') + percent = !percent; + else if (percent) { + if (IN_SET(*p, 'n', 'N', 'i')) + return true; + percent = false; + } + + return false; +} + +/* `name` is the rendered version of `format` via `unit_printf` or similar functions. */ +int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, const char *format) { + const char *fragment_path; + int r; + + assert(u); + assert(name); + + /* If a template unit has a direct dependency on itself that includes the unit instance as part of + * the template instance via a unit specifier (%i, %n or %N), this will almost certainly lead to + * infinite recursion as systemd will keep instantiating new instances of the template unit. + * https://github.com/systemd/systemd/issues/17602 shows a good example of how this can happen in + * practice. To guard against this, we check for templates that depend on themselves and have the + * instantiated unit instance included as part of the template instance of the dependency via a + * specifier. + * + * For example, if systemd-notify@.service depends on systemd-notify@%n.service, this will result in + * infinite recursion. + */ + + if (!unit_name_is_valid(name, UNIT_NAME_INSTANCE)) + return false; + + if (!unit_name_prefix_equal(u->id, name)) + return false; + + if (u->type != unit_name_to_type(name)) + return false; + + r = unit_file_find_fragment(u->manager->unit_id_map, u->manager->unit_name_map, name, &fragment_path, NULL); + if (r < 0) + return r; + + /* Fragment paths should also be equal as a custom fragment for a specific template instance + * wouldn't necessarily lead to infinite recursion. */ + if (!path_equal_ptr(u->fragment_path, fragment_path)) + return false; + + if (!contains_instance_specifier_superset(format)) + return false; + + return true; +} + +int config_parse_unit_deps( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + UnitDependency d = ltype; + Unit *u = userdata; + + assert(filename); + assert(lvalue); + assert(rvalue); + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + int r; + + r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_name_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + continue; + } + + r = unit_is_likely_recursive_template_dependency(u, k, word); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to determine if '%s' is a recursive dependency, ignoring: %m", k); + continue; + } + if (r > 0) { + log_syntax(unit, LOG_DEBUG, filename, line, 0, + "Dropping dependency %s=%s that likely leads to infinite recursion.", + unit_dependency_to_string(d), word); + continue; + } + + r = unit_add_dependency_by_name(u, d, k, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k); + } +} + +int config_parse_obsolete_unit_deps( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unit dependency type %s= is obsolete, replacing by %s=, please update your unit file", lvalue, unit_dependency_to_string(ltype)); + + return config_parse_unit_deps(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata); +} + +int config_parse_unit_string_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + const Unit *u = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_unit_strv_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = ASSERT_PTR(userdata); + _cleanup_free_ char *k = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + return config_parse_strv(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_unit_path_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + const Unit *u = ASSERT_PTR(userdata); + int r; + bool fatal = ltype; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = unit_path_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, fatal ? LOG_ERR : LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, fatal ? "" : ", ignoring"); + return fatal ? -ENOEXEC : 0; + } + + return config_parse_path(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_colon_separated_paths( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + char ***sv = ASSERT_PTR(data); + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *sv = strv_free(*sv); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + break; + + r = unit_path_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = strv_consume(sv, TAKE_PTR(k)); + if (r < 0) + return log_oom(); + } + + return 0; +} + +int config_parse_unit_path_strv_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***x = data; + const Unit *u = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *x = strv_free(*x); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_path_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = strv_consume(x, TAKE_PTR(k)); + if (r < 0) + return log_oom(); + } +} + +static int patch_var_run( + const char *unit, + const char *filename, + unsigned line, + const char *lvalue, + char **path) { + + const char *e; + char *z; + + e = path_startswith(*path, "/var/run/"); + if (!e) + return 0; + + z = path_join("/run/", e); + if (!z) + return log_oom(); + + log_syntax(unit, LOG_NOTICE, filename, line, 0, + "%s= references a path below legacy directory /var/run/, updating %s → %s; " + "please update the unit file accordingly.", lvalue, *path, z); + + free_and_replace(*path, z); + + return 1; +} + +int config_parse_socket_listen( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ SocketPort *p = NULL; + SocketPort *tail; + Socket *s; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + s = SOCKET(data); + + if (isempty(rvalue)) { + /* An empty assignment removes all ports */ + socket_free_ports(s); + return 0; + } + + p = new0(SocketPort, 1); + if (!p) + return log_oom(); + + if (ltype != SOCKET_SOCKET) { + _cleanup_free_ char *k = NULL; + + r = unit_path_printf(UNIT(s), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (ltype == SOCKET_FIFO) { + r = patch_var_run(unit, filename, line, lvalue, &k); + if (r < 0) + return r; + } + + free_and_replace(p->path, k); + p->type = ltype; + + } else if (streq(lvalue, "ListenNetlink")) { + _cleanup_free_ char *k = NULL; + + r = unit_path_printf(UNIT(s), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = socket_address_parse_netlink(&p->address, k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k); + return 0; + } + + p->type = SOCKET_SOCKET; + + } else { + _cleanup_free_ char *k = NULL; + + r = unit_path_printf(UNIT(s), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + if (k[0] == '/') { /* Only for AF_UNIX file system sockets… */ + r = patch_var_run(unit, filename, line, lvalue, &k); + if (r < 0) + return r; + } + + r = socket_address_parse_and_warn(&p->address, k); + if (r < 0) { + if (r != -EAFNOSUPPORT) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k); + return 0; + } + + if (streq(lvalue, "ListenStream")) + p->address.type = SOCK_STREAM; + else if (streq(lvalue, "ListenDatagram")) + p->address.type = SOCK_DGRAM; + else { + assert(streq(lvalue, "ListenSequentialPacket")); + p->address.type = SOCK_SEQPACKET; + } + + if (socket_address_family(&p->address) != AF_UNIX && p->address.type == SOCK_SEQPACKET) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Address family not supported, ignoring: %s", rvalue); + return 0; + } + + p->type = SOCKET_SOCKET; + } + + p->fd = -EBADF; + p->auxiliary_fds = NULL; + p->n_auxiliary_fds = 0; + p->socket = s; + + tail = LIST_FIND_TAIL(port, s->ports); + LIST_INSERT_AFTER(port, s->ports, tail, p); + + p = NULL; + + return 0; +} + +int config_parse_exec_nice( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + int priority, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->nice_set = false; + return 0; + } + + r = parse_nice(rvalue, &priority); + if (r < 0) { + if (r == -ERANGE) + log_syntax(unit, LOG_WARNING, filename, line, r, "Nice priority out of range, ignoring: %s", rvalue); + else + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse nice priority '%s', ignoring: %m", rvalue); + return 0; + } + + c->nice = priority; + c->nice_set = true; + + return 0; +} + +int config_parse_exec_oom_score_adjust( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + int oa, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->oom_score_adjust_set = false; + return 0; + } + + r = parse_oom_score_adjust(rvalue, &oa); + if (r < 0) { + if (r == -ERANGE) + log_syntax(unit, LOG_WARNING, filename, line, r, "OOM score adjust value out of range, ignoring: %s", rvalue); + else + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue); + return 0; + } + + c->oom_score_adjust = oa; + c->oom_score_adjust_set = true; + + return 0; +} + +int config_parse_exec_coredump_filter( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->coredump_filter = 0; + c->coredump_filter_set = false; + return 0; + } + + uint64_t f; + r = coredump_filter_mask_from_string(rvalue, &f); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse the CoredumpFilter=%s, ignoring: %m", rvalue); + return 0; + } + + c->coredump_filter |= f; + c->coredump_filter_set = true; + return 0; +} + +int config_parse_kill_mode( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + KillMode *k = data, m; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + *k = KILL_CONTROL_GROUP; + return 0; + } + + m = kill_mode_from_string(rvalue); + if (m < 0) { + log_syntax(unit, LOG_WARNING, filename, line, m, + "Failed to parse kill mode specification, ignoring: %s", rvalue); + return 0; + } + + if (m == KILL_NONE) + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unit uses KillMode=none. " + "This is unsafe, as it disables systemd's process lifecycle management for the service. " + "Please update the service to use a safer KillMode=, such as 'mixed' or 'control-group'. " + "Support for KillMode=none is deprecated and will eventually be removed."); + + *k = m; + return 0; +} + +int config_parse_exec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecCommand **e = ASSERT_PTR(data); + const Unit *u = userdata; + const char *p; + bool semicolon; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + e += ltype; + + if (isempty(rvalue)) { + /* An empty assignment resets the list */ + *e = exec_command_free_list(*e); + return 0; + } + + p = rvalue; + do { + _cleanup_free_ char *path = NULL, *firstword = NULL; + ExecCommandFlags flags = 0; + bool ignore = false, separate_argv0 = false; + _cleanup_free_ ExecCommand *nce = NULL; + _cleanup_strv_free_ char **n = NULL; + size_t nlen = 0; + const char *f; + + semicolon = false; + + r = extract_first_word_and_warn(&p, &firstword, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue); + if (r <= 0) + return 0; + + /* A lone ";" is a separator. Let's make sure we don't treat it as an executable name. */ + if (streq(firstword, ";")) { + semicolon = true; + continue; + } + + f = firstword; + for (;;) { + /* We accept an absolute path as first argument. If it's prefixed with - and the path doesn't + * exist, we ignore it instead of erroring out; if it's prefixed with @, we allow overriding of + * argv[0]; if it's prefixed with :, we will not do environment variable substitution; + * if it's prefixed with +, it will be run with full privileges and no sandboxing; if + * it's prefixed with '!' we apply sandboxing, but do not change user/group credentials; if + * it's prefixed with '!!', then we apply user/group credentials if the kernel supports ambient + * capabilities -- if it doesn't we don't apply the credentials themselves, but do apply most + * other sandboxing, with some special exceptions for changing UID. + * + * The idea is that '!!' may be used to write services that can take benefit of systemd's + * UID/GID dropping if the kernel supports ambient creds, but provide an automatic fallback to + * privilege dropping within the daemon if the kernel does not offer that. */ + + if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) { + flags |= EXEC_COMMAND_IGNORE_FAILURE; + ignore = true; + } else if (*f == '@' && !separate_argv0) + separate_argv0 = true; + else if (*f == ':' && !(flags & EXEC_COMMAND_NO_ENV_EXPAND)) + flags |= EXEC_COMMAND_NO_ENV_EXPAND; + else if (*f == '+' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))) + flags |= EXEC_COMMAND_FULLY_PRIVILEGED; + else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))) + flags |= EXEC_COMMAND_NO_SETUID; + else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))) { + flags &= ~EXEC_COMMAND_NO_SETUID; + flags |= EXEC_COMMAND_AMBIENT_MAGIC; + } else + break; + f++; + } + + r = unit_path_printf(u, f, &path); + if (r < 0) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + f, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + if (isempty(path)) { + /* First word is either "-" or "@" with no command. */ + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, + "Empty path in command line%s: '%s'", + ignore ? ", ignoring" : "", rvalue); + return ignore ? 0 : -ENOEXEC; + } + if (!string_is_safe(path)) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, + "Executable name contains special characters%s: %s", + ignore ? ", ignoring" : "", path); + return ignore ? 0 : -ENOEXEC; + } + if (endswith(path, "/")) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, + "Executable path specifies a directory%s: %s", + ignore ? ", ignoring" : "", path); + return ignore ? 0 : -ENOEXEC; + } + + if (!(path_is_absolute(path) ? path_is_valid(path) : filename_is_valid(path))) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, + "Neither a valid executable name nor an absolute path%s: %s", + ignore ? ", ignoring" : "", path); + return ignore ? 0 : -ENOEXEC; + } + + if (!separate_argv0) { + char *w = NULL; + + if (!GREEDY_REALLOC0(n, nlen + 2)) + return log_oom(); + + w = strdup(path); + if (!w) + return log_oom(); + n[nlen++] = w; + n[nlen] = NULL; + } + + path_simplify(path); + + while (!isempty(p)) { + _cleanup_free_ char *word = NULL, *resolved = NULL; + + /* Check explicitly for an unquoted semicolon as + * command separator token. */ + if (p[0] == ';' && (!p[1] || strchr(WHITESPACE, p[1]))) { + p++; + p += strspn(p, WHITESPACE); + semicolon = true; + break; + } + + /* Check for \; explicitly, to not confuse it with \\; or "\;" or "\\;" etc. + * extract_first_word() would return the same for all of those. */ + if (p[0] == '\\' && p[1] == ';' && (!p[2] || strchr(WHITESPACE, p[2]))) { + char *w; + + p += 2; + p += strspn(p, WHITESPACE); + + if (!GREEDY_REALLOC0(n, nlen + 2)) + return log_oom(); + + w = strdup(";"); + if (!w) + return log_oom(); + n[nlen++] = w; + n[nlen] = NULL; + continue; + } + + r = extract_first_word_and_warn(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue); + if (r == 0) + break; + if (r < 0) + return ignore ? 0 : -ENOEXEC; + + r = unit_full_printf(u, word, &resolved); + if (r < 0) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in %s%s: %m", + word, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + if (!GREEDY_REALLOC(n, nlen + 2)) + return log_oom(); + + n[nlen++] = TAKE_PTR(resolved); + n[nlen] = NULL; + } + + if (!n || !n[0]) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, + "Empty executable name or zeroeth argument%s: %s", + ignore ? ", ignoring" : "", rvalue); + return ignore ? 0 : -ENOEXEC; + } + + nce = new0(ExecCommand, 1); + if (!nce) + return log_oom(); + + nce->argv = TAKE_PTR(n); + nce->path = TAKE_PTR(path); + nce->flags = flags; + + exec_command_append_list(e, nce); + + /* Do not _cleanup_free_ these. */ + nce = NULL; + + rvalue = p; + } while (semicolon); + + return 0; +} + +int config_parse_socket_bindtodevice( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Socket *s = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue) || streq(rvalue, "*")) { + s->bind_to_device = mfree(s->bind_to_device); + return 0; + } + + if (!ifname_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", rvalue); + return 0; + } + + return free_and_strdup_warn(&s->bind_to_device, rvalue); +} + +int config_parse_exec_input( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + const char *n; + ExecInput ei; + int r; + + assert(filename); + assert(line); + assert(rvalue); + + n = startswith(rvalue, "fd:"); + if (n) { + _cleanup_free_ char *resolved = NULL; + + r = unit_fd_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n); + return 0; + } + + if (isempty(resolved)) + resolved = mfree(resolved); + else if (!fdname_is_valid(resolved)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved); + return 0; + } + + free_and_replace(c->stdio_fdname[STDIN_FILENO], resolved); + + ei = EXEC_INPUT_NAMED_FD; + + } else if ((n = startswith(rvalue, "file:"))) { + _cleanup_free_ char *resolved = NULL; + + r = unit_path_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return 0; + + free_and_replace(c->stdio_file[STDIN_FILENO], resolved); + + ei = EXEC_INPUT_FILE; + + } else { + ei = exec_input_from_string(rvalue); + if (ei < 0) { + log_syntax(unit, LOG_WARNING, filename, line, ei, "Failed to parse input specifier, ignoring: %s", rvalue); + return 0; + } + } + + c->std_input = ei; + return 0; +} + +int config_parse_exec_input_text( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *unescaped = NULL, *resolved = NULL; + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + int r; + + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + return 0; + } + + ssize_t l = cunescape(rvalue, 0, &unescaped); + if (l < 0) { + log_syntax(unit, LOG_WARNING, filename, line, l, + "Failed to decode C escaped text '%s', ignoring: %m", rvalue); + return 0; + } + + r = unit_full_printf_full(u, unescaped, EXEC_STDIN_DATA_MAX, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", unescaped); + return 0; + } + + size_t sz = strlen(resolved); + if (c->stdin_data_size + sz + 1 < c->stdin_data_size || /* check for overflow */ + c->stdin_data_size + sz + 1 > EXEC_STDIN_DATA_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Standard input data too large (%zu), maximum of %zu permitted, ignoring.", + c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX); + return 0; + } + + void *p = realloc(c->stdin_data, c->stdin_data_size + sz + 1); + if (!p) + return log_oom(); + + *((char*) mempcpy((char*) p + c->stdin_data_size, resolved, sz)) = '\n'; + + c->stdin_data = p; + c->stdin_data_size += sz + 1; + + return 0; +} + +int config_parse_exec_input_data( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ void *p = NULL; + ExecContext *c = ASSERT_PTR(data); + size_t sz; + void *q; + int r; + + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + return 0; + } + + r = unbase64mem(rvalue, SIZE_MAX, &p, &sz); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to decode base64 data, ignoring: %s", rvalue); + return 0; + } + + assert(sz > 0); + + if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */ + c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Standard input data too large (%zu), maximum of %zu permitted, ignoring.", + c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX); + return 0; + } + + q = realloc(c->stdin_data, c->stdin_data_size + sz); + if (!q) + return log_oom(); + + memcpy((uint8_t*) q + c->stdin_data_size, p, sz); + + c->stdin_data = q; + c->stdin_data_size += sz; + + return 0; +} + +int config_parse_exec_output( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *resolved = NULL; + const char *n; + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + bool obsolete = false; + ExecOutput eo; + int r; + + assert(filename); + assert(line); + assert(lvalue); + assert(rvalue); + + n = startswith(rvalue, "fd:"); + if (n) { + r = unit_fd_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", n); + return 0; + } + + if (isempty(resolved)) + resolved = mfree(resolved); + else if (!fdname_is_valid(resolved)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved); + return 0; + } + + eo = EXEC_OUTPUT_NAMED_FD; + + } else if (streq(rvalue, "syslog")) { + eo = EXEC_OUTPUT_JOURNAL; + obsolete = true; + + } else if (streq(rvalue, "syslog+console")) { + eo = EXEC_OUTPUT_JOURNAL_AND_CONSOLE; + obsolete = true; + + } else if ((n = startswith(rvalue, "file:"))) { + + r = unit_path_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return 0; + + eo = EXEC_OUTPUT_FILE; + + } else if ((n = startswith(rvalue, "append:"))) { + + r = unit_path_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return 0; + + eo = EXEC_OUTPUT_FILE_APPEND; + + } else if ((n = startswith(rvalue, "truncate:"))) { + + r = unit_path_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return 0; + + eo = EXEC_OUTPUT_FILE_TRUNCATE; + } else { + eo = exec_output_from_string(rvalue); + if (eo < 0) { + log_syntax(unit, LOG_WARNING, filename, line, eo, "Failed to parse output specifier, ignoring: %s", rvalue); + return 0; + } + } + + if (obsolete) + log_syntax(unit, LOG_NOTICE, filename, line, 0, + "Standard output type %s is obsolete, automatically updating to %s. Please update your unit file, and consider removing the setting altogether.", + rvalue, exec_output_to_string(eo)); + + if (streq(lvalue, "StandardOutput")) { + if (eo == EXEC_OUTPUT_NAMED_FD) + free_and_replace(c->stdio_fdname[STDOUT_FILENO], resolved); + else + free_and_replace(c->stdio_file[STDOUT_FILENO], resolved); + + c->std_output = eo; + + } else { + assert(streq(lvalue, "StandardError")); + + if (eo == EXEC_OUTPUT_NAMED_FD) + free_and_replace(c->stdio_fdname[STDERR_FILENO], resolved); + else + free_and_replace(c->stdio_file[STDERR_FILENO], resolved); + + c->std_error = eo; + } + + return 0; +} + +int config_parse_exec_io_class(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + int x; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->ioprio_set = false; + c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO; + return 0; + } + + x = ioprio_class_from_string(rvalue); + if (x < 0) { + log_syntax(unit, LOG_WARNING, filename, line, x, "Failed to parse IO scheduling class, ignoring: %s", rvalue); + return 0; + } + + c->ioprio = ioprio_normalize(ioprio_prio_value(x, ioprio_prio_data(c->ioprio))); + c->ioprio_set = true; + + return 0; +} + +int config_parse_exec_io_priority(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + int i, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->ioprio_set = false; + c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO; + return 0; + } + + r = ioprio_parse_priority(rvalue, &i); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse IO priority, ignoring: %s", rvalue); + return 0; + } + + c->ioprio = ioprio_normalize(ioprio_prio_value(ioprio_prio_class(c->ioprio), i)); + c->ioprio_set = true; + + return 0; +} + +int config_parse_exec_cpu_sched_policy(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + int x; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->cpu_sched_set = false; + c->cpu_sched_policy = SCHED_OTHER; + c->cpu_sched_priority = 0; + return 0; + } + + x = sched_policy_from_string(rvalue); + if (x < 0) { + log_syntax(unit, LOG_WARNING, filename, line, x, "Failed to parse CPU scheduling policy, ignoring: %s", rvalue); + return 0; + } + + c->cpu_sched_policy = x; + /* Moving to or from real-time policy? We need to adjust the priority */ + c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(x), sched_get_priority_max(x)); + c->cpu_sched_set = true; + + return 0; +} + +int config_parse_exec_mount_apivfs(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + int k; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->mount_apivfs_set = false; + c->mount_apivfs = false; + return 0; + } + + k = parse_boolean(rvalue); + if (k < 0) { + log_syntax(unit, LOG_WARNING, filename, line, k, + "Failed to parse boolean value, ignoring: %s", + rvalue); + return 0; + } + + c->mount_apivfs_set = true; + c->mount_apivfs = k; + return 0; +} + +int config_parse_numa_mask(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + int r; + NUMAPolicy *p = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (streq(rvalue, "all")) { + r = numa_mask_add_all(&p->nodes); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to create NUMA mask representing \"all\" NUMA nodes, ignoring: %m"); + } else { + r = parse_cpu_set_extend(rvalue, &p->nodes, true, unit, filename, line, lvalue); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse NUMA node mask, ignoring: %s", rvalue); + } + + return 0; +} + +int config_parse_exec_cpu_sched_prio(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + int i, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = safe_atoi(rvalue, &i); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CPU scheduling priority, ignoring: %s", rvalue); + return 0; + } + + /* On Linux RR/FIFO range from 1 to 99 and OTHER/BATCH may only be 0. Policy might be set later so + * we do not check the precise range, but only the generic outer bounds. */ + if (i < 0 || i > 99) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "CPU scheduling priority is out of range, ignoring: %s", rvalue); + return 0; + } + + c->cpu_sched_priority = i; + c->cpu_sched_set = true; + + return 0; +} + +int config_parse_root_image_options( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_strv_free_ char **l = NULL; + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->root_image_options = mount_options_free_all(c->root_image_options); + return 0; + } + + r = strv_split_colon_pairs(&l, rvalue); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue); + return 0; + } + + STRV_FOREACH_PAIR(first, second, l) { + MountOptions *o = NULL; + _cleanup_free_ char *mount_options_resolved = NULL; + const char *mount_options = NULL, *partition = "root"; + PartitionDesignator partition_designator; + + /* Format is either 'root:foo' or 'foo' (root is implied) */ + if (!isempty(*second)) { + partition = *first; + mount_options = *second; + } else + mount_options = *first; + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) { + log_syntax(unit, LOG_WARNING, filename, line, partition_designator, + "Invalid partition name %s, ignoring", partition); + continue; + } + r = unit_full_printf(u, mount_options, &mount_options_resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options); + continue; + } + + o = new(MountOptions, 1); + if (!o) + return log_oom(); + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = TAKE_PTR(mount_options_resolved), + }; + LIST_APPEND(mount_options, options, TAKE_PTR(o)); + } + + if (options) + LIST_JOIN(mount_options, c->root_image_options, options); + else + /* empty spaces/separators only */ + c->root_image_options = mount_options_free_all(c->root_image_options); + + return 0; +} + +int config_parse_exec_root_hash( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ void *roothash_decoded = NULL; + ExecContext *c = ASSERT_PTR(data); + size_t roothash_decoded_size = 0; + int r; + + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->root_hash_path = mfree(c->root_hash_path); + c->root_hash = mfree(c->root_hash); + c->root_hash_size = 0; + return 0; + } + + if (path_is_absolute(rvalue)) { + /* We have the path to a roothash to load and decode, eg: RootHash=/foo/bar.roothash */ + _cleanup_free_ char *p = NULL; + + p = strdup(rvalue); + if (!p) + return -ENOMEM; + + free_and_replace(c->root_hash_path, p); + c->root_hash = mfree(c->root_hash); + c->root_hash_size = 0; + return 0; + } + + /* We have a roothash to decode, eg: RootHash=012345789abcdef */ + r = unhexmem(rvalue, strlen(rvalue), &roothash_decoded, &roothash_decoded_size); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHash=, ignoring: %s", rvalue); + return 0; + } + if (roothash_decoded_size < sizeof(sd_id128_t)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "RootHash= is too short, ignoring: %s", rvalue); + return 0; + } + + free_and_replace(c->root_hash, roothash_decoded); + c->root_hash_size = roothash_decoded_size; + c->root_hash_path = mfree(c->root_hash_path); + + return 0; +} + +int config_parse_exec_root_hash_sig( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ void *roothash_sig_decoded = NULL; + char *value; + ExecContext *c = ASSERT_PTR(data); + size_t roothash_sig_decoded_size = 0; + int r; + + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + c->root_hash_sig = mfree(c->root_hash_sig); + c->root_hash_sig_size = 0; + return 0; + } + + if (path_is_absolute(rvalue)) { + /* We have the path to a roothash signature to load and decode, eg: RootHashSignature=/foo/bar.roothash.p7s */ + _cleanup_free_ char *p = NULL; + + p = strdup(rvalue); + if (!p) + return log_oom(); + + free_and_replace(c->root_hash_sig_path, p); + c->root_hash_sig = mfree(c->root_hash_sig); + c->root_hash_sig_size = 0; + return 0; + } + + if (!(value = startswith(rvalue, "base64:"))) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to decode RootHashSignature=, not a path but doesn't start with 'base64:', ignoring: %s", rvalue); + return 0; + } + + /* We have a roothash signature to decode, eg: RootHashSignature=base64:012345789abcdef */ + r = unbase64mem(value, strlen(value), &roothash_sig_decoded, &roothash_sig_decoded_size); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHashSignature=, ignoring: %s", rvalue); + return 0; + } + + free_and_replace(c->root_hash_sig, roothash_sig_decoded); + c->root_hash_sig_size = roothash_sig_decoded_size; + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + + return 0; +} + +int config_parse_exec_cpu_affinity( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + _cleanup_free_ char *k = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (streq(rvalue, "numa")) { + c->cpu_affinity_from_numa = true; + cpu_set_reset(&c->cpu_set); + + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", + rvalue); + return 0; + } + + r = parse_cpu_set_extend(k, &c->cpu_set, true, unit, filename, line, lvalue); + if (r >= 0) + c->cpu_affinity_from_numa = false; + + return 0; +} + +int config_parse_capability_set( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *capability_set = ASSERT_PTR(data); + uint64_t sum = 0, initial, def; + bool invert = false; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (streq(lvalue, "CapabilityBoundingSet")) { + initial = CAP_MASK_ALL; /* initialized to all bits on */ + def = CAP_MASK_UNSET; /* not set */ + } else + def = initial = 0; /* All bits off */ + + r = capability_set_from_string(rvalue, &sum); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= specifier '%s', ignoring: %m", lvalue, rvalue); + return 0; + } + + if (sum == 0 || *capability_set == def) + /* "", "~" or uninitialized data -> replace */ + *capability_set = invert ? ~sum : sum; + else { + /* previous data -> merge */ + if (invert) + *capability_set &= ~sum; + else + *capability_set |= sum; + } + + return 0; +} + +int config_parse_exec_selinux_context( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + bool ignore; + char *k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->selinux_context = mfree(c->selinux_context); + c->selinux_context_ignore = false; + return 0; + } + + if (rvalue[0] == '-') { + ignore = true; + rvalue++; + } else + ignore = false; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + free_and_replace(c->selinux_context, k); + c->selinux_context_ignore = ignore; + + return 0; +} + +int config_parse_exec_apparmor_profile( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + bool ignore; + char *k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->apparmor_profile = mfree(c->apparmor_profile); + c->apparmor_profile_ignore = false; + return 0; + } + + if (rvalue[0] == '-') { + ignore = true; + rvalue++; + } else + ignore = false; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + free_and_replace(c->apparmor_profile, k); + c->apparmor_profile_ignore = ignore; + + return 0; +} + +int config_parse_exec_smack_process_label( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + bool ignore; + char *k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->smack_process_label = mfree(c->smack_process_label); + c->smack_process_label_ignore = false; + return 0; + } + + if (rvalue[0] == '-') { + ignore = true; + rvalue++; + } else + ignore = false; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + free_and_replace(c->smack_process_label, k); + c->smack_process_label_ignore = ignore; + + return 0; +} + +int config_parse_timer( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + _cleanup_free_ char *k = NULL; + const Unit *u = userdata; + Timer *t = ASSERT_PTR(data); + usec_t usec = 0; + TimerValue *v; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets list */ + timer_free_values(t); + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + if (ltype == TIMER_CALENDAR) { + r = calendar_spec_from_string(k, &c); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse calendar specification, ignoring: %s", k); + return 0; + } + } else { + r = parse_sec(k, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", k); + return 0; + } + } + + v = new(TimerValue, 1); + if (!v) + return log_oom(); + + *v = (TimerValue) { + .base = ltype, + .value = usec, + .calendar_spec = TAKE_PTR(c), + }; + + LIST_PREPEND(value, t->values, v); + + return 0; +} + +int config_parse_trigger_unit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *p = NULL; + Unit *u = ASSERT_PTR(data); + UnitType type; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (UNIT_TRIGGER(u)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Multiple units to trigger specified, ignoring: %s", rvalue); + return 0; + } + + r = unit_name_printf(u, rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + type = unit_name_to_type(p); + if (type < 0) { + log_syntax(unit, LOG_WARNING, filename, line, type, "Unit type not valid, ignoring: %s", rvalue); + return 0; + } + if (unit_has_name(u, p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Units cannot trigger themselves, ignoring: %s", rvalue); + return 0; + } + + r = unit_add_two_dependencies_by_name(u, UNIT_BEFORE, UNIT_TRIGGERS, p, true, UNIT_DEPENDENCY_FILE); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add trigger on %s, ignoring: %m", p); + return 0; + } + + return 0; +} + +int config_parse_path_spec(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Path *p = ASSERT_PTR(data); + PathSpec *s; + PathType b; + _cleanup_free_ char *k = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment clears list */ + path_free_specs(p); + return 0; + } + + b = path_type_from_string(lvalue); + if (b < 0) { + log_syntax(unit, LOG_WARNING, filename, line, b, "Failed to parse path type, ignoring: %s", lvalue); + return 0; + } + + r = unit_path_printf(UNIT(p), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + s = new0(PathSpec, 1); + if (!s) + return log_oom(); + + s->unit = UNIT(p); + s->path = TAKE_PTR(k); + s->type = b; + s->inotify_fd = -EBADF; + + LIST_PREPEND(spec, p->specs, s); + + return 0; +} + +int config_parse_socket_service( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *p = NULL; + Socket *s = ASSERT_PTR(data); + Unit *x; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = unit_name_printf(UNIT(s), rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + if (!endswith(p, ".service")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type service, ignoring: %s", rvalue); + return 0; + } + + r = manager_load_unit(UNIT(s)->manager, p, NULL, &error, &x); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load unit %s, ignoring: %s", rvalue, bus_error_message(&error, r)); + return 0; + } + + unit_ref_set(&s->service, UNIT(s), x); + + return 0; +} + +int config_parse_fdname( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *p = NULL; + Socket *s = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + s->fdname = mfree(s->fdname); + return 0; + } + + r = unit_fd_printf(UNIT(s), rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + if (!fdname_is_valid(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", p); + return 0; + } + + return free_and_replace(s->fdname, p); +} + +int config_parse_service_sockets( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Service *s = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Trailing garbage in sockets, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = unit_name_printf(UNIT(s), word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + continue; + } + + if (!endswith(k, ".socket")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type socket, ignoring: %s", k); + continue; + } + + r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_WANTS, UNIT_AFTER, k, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k); + + r = unit_add_dependency_by_name(UNIT(s), UNIT_TRIGGERED_BY, k, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k); + } +} + +int config_parse_bus_name( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + const Unit *u = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = unit_full_printf_full(u, rvalue, SD_BUS_MAXIMUM_NAME_LENGTH, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + if (!sd_bus_service_name_is_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid bus name, ignoring: %s", k); + return 0; + } + + return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_service_timeout( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Service *s = ASSERT_PTR(userdata); + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* This is called for two cases: TimeoutSec= and TimeoutStartSec=. */ + + /* Traditionally, these options accepted 0 to disable the timeouts. However, a timeout of 0 suggests it happens + * immediately, hence fix this to become USEC_INFINITY instead. This is in-line with how we internally handle + * all other timeouts. */ + r = parse_sec_fix_0(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= parameter, ignoring: %s", lvalue, rvalue); + return 0; + } + + s->start_timeout_defined = true; + s->timeout_start_usec = usec; + + if (streq(lvalue, "TimeoutSec")) + s->timeout_stop_usec = usec; + + return 0; +} + +int config_parse_timeout_abort( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t *ret = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* Note: apart from setting the arg, this returns an extra bit of information in the return value. */ + + if (isempty(rvalue)) { + *ret = 0; + return 0; /* "not set" */ + } + + r = parse_sec(rvalue, ret); + if (r < 0) + return log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= setting, ignoring: %s", lvalue, rvalue); + + return 1; /* "set" */ +} + +int config_parse_service_timeout_abort( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Service *s = ASSERT_PTR(userdata); + int r; + + r = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype, rvalue, + &s->timeout_abort_usec, s); + if (r >= 0) + s->timeout_abort_set = r; + return 0; +} + +int config_parse_user_group_compat( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + char **user = data; + const Unit *u = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *user = mfree(*user); + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", rvalue); + return -ENOEXEC; + } + + if (!valid_user_group_name(k, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k); + return -ENOEXEC; + } + + if (strstr(lvalue, "User") && streq(k, NOBODY_USER_NAME)) + log_struct(LOG_NOTICE, + "MESSAGE=%s:%u: Special user %s configured, this is not safe!", filename, line, k, + "UNIT=%s", unit, + "MESSAGE_ID=" SD_MESSAGE_NOBODY_USER_UNSUITABLE_STR, + "OFFENDING_USER=%s", k, + "CONFIG_FILE=%s", filename, + "CONFIG_LINE=%u", line); + + return free_and_replace(*user, k); +} + +int config_parse_user_group_strv_compat( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***users = data; + const Unit *u = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *users = strv_free(*users); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid syntax: %s", rvalue); + return -ENOEXEC; + } + if (r == 0) + return 0; + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", word); + return -ENOEXEC; + } + + if (!valid_user_group_name(k, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k); + return -ENOEXEC; + } + + r = strv_push(users, k); + if (r < 0) + return log_oom(); + + k = NULL; + } +} + +int config_parse_working_directory( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = ASSERT_PTR(userdata); + bool missing_ok; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->working_directory_home = false; + c->working_directory = mfree(c->working_directory); + return 0; + } + + if (rvalue[0] == '-') { + missing_ok = true; + rvalue++; + } else + missing_ok = false; + + if (streq(rvalue, "~")) { + c->working_directory_home = true; + c->working_directory = mfree(c->working_directory); + } else { + _cleanup_free_ char *k = NULL; + + r = unit_path_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, missing_ok ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in working directory path '%s'%s: %m", + rvalue, missing_ok ? ", ignoring" : ""); + return missing_ok ? 0 : -ENOEXEC; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE | (missing_ok ? 0 : PATH_CHECK_FATAL), unit, filename, line, lvalue); + if (r < 0) + return missing_ok ? 0 : -ENOEXEC; + + c->working_directory_home = false; + free_and_replace(c->working_directory, k); + } + + c->working_directory_missing_ok = missing_ok; + return 0; +} + +int config_parse_unit_env_file(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***env = ASSERT_PTR(data); + const Unit *u = userdata; + _cleanup_free_ char *n = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment frees the list */ + *env = strv_free(*env); + return 0; + } + + r = unit_full_printf_full(u, rvalue, PATH_MAX, &n); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(n[0] == '-' ? n + 1 : n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = strv_push(env, n); + if (r < 0) + return log_oom(); + + n = NULL; + + return 0; +} + +int config_parse_environ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = userdata; + char ***env = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *env = strv_free(*env); + return 0; + } + + /* If 'u' is set, we operate on the regular unit specifier table. Otherwise we use a manager-specific + * specifier table (in which case ltype must contain the runtime scope). */ + const Specifier *table = u ? NULL : (const Specifier[]) { + COMMON_SYSTEM_SPECIFIERS, + COMMON_TMP_SPECIFIERS, + COMMON_CREDS_SPECIFIERS(ltype), + { 'h', specifier_user_home, NULL }, + { 's', specifier_user_shell, NULL }, + }; + + for (const char *p = rvalue;; ) { + _cleanup_free_ char *word = NULL, *resolved = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + if (table) + r = specifier_printf(word, sc_arg_max(), table, NULL, NULL, &resolved); + else + r = unit_env_printf(u, word, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve specifiers in %s, ignoring: %m", word); + continue; + } + + if (!env_assignment_is_valid(resolved)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid environment assignment, ignoring: %s", resolved); + continue; + } + + r = strv_env_replace_consume(env, TAKE_PTR(resolved)); + if (r < 0) + return log_error_errno(r, "Failed to update environment: %m"); + } +} + +int config_parse_pass_environ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_strv_free_ char **n = NULL; + const Unit *u = userdata; + char*** passenv = ASSERT_PTR(data); + size_t nlen = 0; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *passenv = strv_free(*passenv); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Trailing garbage in %s, ignoring: %s", lvalue, rvalue); + break; + } + if (r == 0) + break; + + if (u) { + r = unit_env_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve specifiers in %s, ignoring: %m", word); + continue; + } + } else + k = TAKE_PTR(word); + + if (!env_name_is_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid environment name for %s, ignoring: %s", lvalue, k); + continue; + } + + if (!GREEDY_REALLOC(n, nlen + 2)) + return log_oom(); + + n[nlen++] = TAKE_PTR(k); + n[nlen] = NULL; + } + + if (n) { + r = strv_extend_strv(passenv, n, true); + if (r < 0) + return log_oom(); + } + + return 0; +} + +int config_parse_unset_environ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_strv_free_ char **n = NULL; + char*** unsetenv = ASSERT_PTR(data); + const Unit *u = userdata; + size_t nlen = 0; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *unsetenv = strv_free(*unsetenv); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Trailing garbage in %s, ignoring: %s", lvalue, rvalue); + break; + } + if (r == 0) + break; + + if (u) { + r = unit_env_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in %s, ignoring: %m", word); + continue; + } + } else + k = TAKE_PTR(word); + + if (!env_assignment_is_valid(k) && !env_name_is_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid environment name or assignment %s, ignoring: %s", lvalue, k); + continue; + } + + if (!GREEDY_REALLOC(n, nlen + 2)) + return log_oom(); + + n[nlen++] = TAKE_PTR(k); + n[nlen] = NULL; + } + + if (n) { + r = strv_extend_strv(unsetenv, n, true); + if (r < 0) + return log_oom(); + } + + return 0; +} + +int config_parse_log_extra_fields( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + exec_context_free_log_extra_fields(c); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + struct iovec *t; + const char *eq; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", word); + continue; + } + + eq = strchr(k, '='); + if (!eq) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field lacks '=' character, ignoring: %s", k); + continue; + } + + if (!journal_field_valid(k, eq-k, false)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field name is invalid, ignoring: %s", k); + continue; + } + + t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec)); + if (!t) + return log_oom(); + + c->log_extra_fields = t; + c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE_STRING(k); + + k = NULL; + } +} + +int config_parse_log_namespace( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->log_namespace = mfree(c->log_namespace); + return 0; + } + + r = unit_full_printf_full(u, rvalue, NAME_MAX, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + if (!log_namespace_name_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Specified log namespace name is not valid, ignoring: %s", k); + return 0; + } + + free_and_replace(c->log_namespace, k); + return 0; +} + +int config_parse_unit_condition_path( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *p = NULL; + Condition **list = ASSERT_PTR(data), *c; + ConditionType t = ltype; + bool trigger, negate; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *list = condition_free_list(*list); + return 0; + } + + trigger = rvalue[0] == '|'; + if (trigger) + rvalue++; + + negate = rvalue[0] == '!'; + if (negate) + rvalue++; + + r = unit_path_printf(u, rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(p, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + c = condition_new(t, p, trigger, negate); + if (!c) + return log_oom(); + + LIST_PREPEND(conditions, *list, c); + return 0; +} + +int config_parse_unit_condition_string( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *s = NULL; + Condition **list = ASSERT_PTR(data), *c; + ConditionType t = ltype; + bool trigger, negate; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *list = condition_free_list(*list); + return 0; + } + + trigger = *rvalue == '|'; + if (trigger) + rvalue += 1 + strspn(rvalue + 1, WHITESPACE); + + negate = *rvalue == '!'; + if (negate) + rvalue += 1 + strspn(rvalue + 1, WHITESPACE); + + r = unit_full_printf(u, rvalue, &s); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + c = condition_new(t, s, trigger, negate); + if (!c) + return log_oom(); + + LIST_PREPEND(conditions, *list, c); + return 0; +} + +int config_parse_unit_requires_mounts_for( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *resolved = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = unit_path_printf(u, word, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + continue; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + r = unit_require_mounts_for(u, resolved, UNIT_DEPENDENCY_FILE); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add required mount '%s', ignoring: %m", resolved); + continue; + } + } +} + +int config_parse_documentation( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = ASSERT_PTR(userdata); + int r; + char **a, **b; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + u->documentation = strv_free(u->documentation); + return 0; + } + + r = config_parse_unit_strv_printf(unit, filename, line, section, section_line, lvalue, ltype, + rvalue, data, userdata); + if (r < 0) + return r; + + for (a = b = u->documentation; a && *a; a++) { + + if (documentation_url_is_valid(*a)) + *(b++) = *a; + else { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid URL, ignoring: %s", *a); + free(*a); + } + } + if (b) + *b = NULL; + + return 0; +} + +#if HAVE_SECCOMP +int config_parse_syscall_filter( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + _unused_ const Unit *u = ASSERT_PTR(userdata); + bool invert = false; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->syscall_filter = hashmap_free(c->syscall_filter); + c->syscall_allow_list = false; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->syscall_filter) { + c->syscall_filter = hashmap_new(NULL); + if (!c->syscall_filter) + return log_oom(); + + if (invert) + /* Allow everything but the ones listed */ + c->syscall_allow_list = false; + else { + /* Allow nothing but the ones listed */ + c->syscall_allow_list = true; + + /* Accept default syscalls if we are on an allow_list */ + r = seccomp_parse_syscall_filter( + "@default", -1, c->syscall_filter, + SECCOMP_PARSE_PERMISSIVE|SECCOMP_PARSE_ALLOW_LIST, + unit, + NULL, 0); + if (r < 0) + return r; + } + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *name = NULL; + int num; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = parse_syscall_and_errno(word, &name, &num); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse syscall:errno, ignoring: %s", word); + continue; + } + if (!invert && num >= 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Allow-listed system calls cannot take error number, ignoring: %s", word); + continue; + } + + r = seccomp_parse_syscall_filter( + name, num, c->syscall_filter, + SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE| + (invert ? SECCOMP_PARSE_INVERT : 0)| + (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0), + unit, filename, line); + if (r < 0) + return r; + } +} + +int config_parse_syscall_log( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + _unused_ const Unit *u = ASSERT_PTR(userdata); + bool invert = false; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->syscall_log = hashmap_free(c->syscall_log); + c->syscall_log_allow_list = false; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->syscall_log) { + c->syscall_log = hashmap_new(NULL); + if (!c->syscall_log) + return log_oom(); + + if (invert) + /* Log everything but the ones listed */ + c->syscall_log_allow_list = false; + else + /* Log nothing but the ones listed */ + c->syscall_log_allow_list = true; + } + + p = rvalue; + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = seccomp_parse_syscall_filter( + word, -1, c->syscall_log, + SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE| + (invert ? SECCOMP_PARSE_INVERT : 0)| + (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0), + unit, filename, line); + if (r < 0) + return r; + } +} + +int config_parse_syscall_archs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Set **archs = data; + int r; + + if (isempty(rvalue)) { + *archs = set_free(*archs); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + uint32_t a; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = seccomp_arch_from_string(word, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse system call architecture \"%s\", ignoring: %m", word); + continue; + } + + r = set_ensure_put(archs, NULL, UINT32_TO_PTR(a + 1)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_syscall_errno( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int e; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue) || streq(rvalue, "kill")) { + /* Empty assignment resets to KILL */ + c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL; + return 0; + } + + e = parse_errno(rvalue); + if (e < 0) { + log_syntax(unit, LOG_WARNING, filename, line, e, "Failed to parse error number, ignoring: %s", rvalue); + return 0; + } + if (e == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid error number, ignoring: %s", rvalue); + return 0; + } + + c->syscall_errno = e; + return 0; +} + +int config_parse_address_families( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + bool invert = false; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->address_families = set_free(c->address_families); + c->address_families_allow_list = false; + return 0; + } + + if (streq(rvalue, "none")) { + /* Forbid all address families. */ + c->address_families = set_free(c->address_families); + c->address_families_allow_list = true; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->address_families) { + c->address_families = set_new(NULL); + if (!c->address_families) + return log_oom(); + + c->address_families_allow_list = !invert; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + int af; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + af = af_from_name(word); + if (af < 0) { + log_syntax(unit, LOG_WARNING, filename, line, af, + "Failed to parse address family, ignoring: %s", word); + continue; + } + + /* If we previously wanted to forbid an address family and now + * we want to allow it, then just remove it from the list. + */ + if (!invert == c->address_families_allow_list) { + r = set_put(c->address_families, INT_TO_PTR(af)); + if (r < 0) + return log_oom(); + } else + set_remove(c->address_families, INT_TO_PTR(af)); + } +} + +int config_parse_restrict_namespaces( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + unsigned long flags; + bool invert = false; + int r; + + if (isempty(rvalue)) { + /* Reset to the default. */ + c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL; + return 0; + } + + /* Boolean parameter ignores the previous settings */ + r = parse_boolean(rvalue); + if (r > 0) { + c->restrict_namespaces = 0; + return 0; + } else if (r == 0) { + c->restrict_namespaces = NAMESPACE_FLAGS_ALL; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + /* Not a boolean argument, in this case it's a list of namespace types. */ + r = namespace_flags_from_string(rvalue, &flags); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue); + return 0; + } + + if (c->restrict_namespaces == NAMESPACE_FLAGS_INITIAL) + /* Initial assignment. Just set the value. */ + c->restrict_namespaces = invert ? (~flags) & NAMESPACE_FLAGS_ALL : flags; + else + /* Merge the value with the previous one. */ + SET_FLAG(c->restrict_namespaces, flags, !invert); + + return 0; +} +#endif + +int config_parse_restrict_filesystems( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + ExecContext *c = ASSERT_PTR(data); + bool invert = false; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->restrict_filesystems = set_free_free(c->restrict_filesystems); + c->restrict_filesystems_allow_list = false; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->restrict_filesystems) { + if (invert) + /* Allow everything but the ones listed */ + c->restrict_filesystems_allow_list = false; + else + /* Allow nothing but the ones listed */ + c->restrict_filesystems_allow_list = true; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Trailing garbage in %s, ignoring: %s", lvalue, rvalue); + break; + } + + r = lsm_bpf_parse_filesystem( + word, + &c->restrict_filesystems, + FILESYSTEM_PARSE_LOG| + (invert ? FILESYSTEM_PARSE_INVERT : 0)| + (c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0), + unit, filename, line); + + if (r < 0) + return r; + } + + return 0; +} + +int config_parse_unit_slice( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *k = NULL; + Unit *u = userdata, *slice; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = unit_name_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + r = manager_load_unit(u->manager, k, NULL, &error, &slice); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load slice unit %s, ignoring: %s", k, bus_error_message(&error, r)); + return 0; + } + + r = unit_set_slice(u, slice); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to assign slice %s to unit %s, ignoring: %m", slice->id, u->id); + return 0; + } + + return 0; +} + +int config_parse_cpu_quota( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->cpu_quota_per_sec_usec = USEC_INFINITY; + return 0; + } + + r = parse_permyriad_unbounded(rvalue); + if (r <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid CPU quota '%s', ignoring.", rvalue); + return 0; + } + + c->cpu_quota_per_sec_usec = ((usec_t) r * USEC_PER_SEC) / 10000U; + return 0; +} + +int config_parse_allowed_cpuset( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CPUSet *c = data; + const Unit *u = userdata; + _cleanup_free_ char *k = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", + rvalue); + return 0; + } + + (void) parse_cpu_set_extend(k, c, true, unit, filename, line, lvalue); + return 0; +} + +int config_parse_memory_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + uint64_t bytes = CGROUP_LIMIT_MAX; + int r; + + if (isempty(rvalue) && STR_IN_SET(lvalue, "DefaultMemoryLow", + "DefaultMemoryMin", + "MemoryLow", + "StartupMemoryLow", + "MemoryMin")) + bytes = CGROUP_LIMIT_MIN; + else if (!isempty(rvalue) && !streq(rvalue, "infinity")) { + + r = parse_permyriad(rvalue); + if (r < 0) { + r = parse_size(rvalue, 1024, &bytes); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid memory limit '%s', ignoring: %m", rvalue); + return 0; + } + } else + bytes = physical_memory_scale(r, 10000U); + + if (bytes >= UINT64_MAX || + (bytes <= 0 && !STR_IN_SET(lvalue, + "MemorySwapMax", + "StartupMemorySwapMax", + "MemoryZSwapMax", + "StartupMemoryZSwapMax", + "MemoryLow", + "StartupMemoryLow", + "MemoryMin", + "DefaultMemoryLow", + "DefaultstartupMemoryLow", + "DefaultMemoryMin"))) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Memory limit '%s' out of range, ignoring.", rvalue); + return 0; + } + } + + if (streq(lvalue, "DefaultMemoryLow")) { + c->default_memory_low = bytes; + c->default_memory_low_set = true; + } else if (streq(lvalue, "DefaultStartupMemoryLow")) { + c->default_startup_memory_low = bytes; + c->default_startup_memory_low_set = true; + } else if (streq(lvalue, "DefaultMemoryMin")) { + c->default_memory_min = bytes; + c->default_memory_min_set = true; + } else if (streq(lvalue, "MemoryMin")) { + c->memory_min = bytes; + c->memory_min_set = true; + } else if (streq(lvalue, "MemoryLow")) { + c->memory_low = bytes; + c->memory_low_set = true; + } else if (streq(lvalue, "StartupMemoryLow")) { + c->startup_memory_low = bytes; + c->startup_memory_low_set = true; + } else if (streq(lvalue, "MemoryHigh")) + c->memory_high = bytes; + else if (streq(lvalue, "StartupMemoryHigh")) { + c->startup_memory_high = bytes; + c->startup_memory_high_set = true; + } else if (streq(lvalue, "MemoryMax")) + c->memory_max = bytes; + else if (streq(lvalue, "StartupMemoryMax")) { + c->startup_memory_max = bytes; + c->startup_memory_max_set = true; + } else if (streq(lvalue, "MemorySwapMax")) + c->memory_swap_max = bytes; + else if (streq(lvalue, "StartupMemorySwapMax")) { + c->startup_memory_swap_max = bytes; + c->startup_memory_swap_max_set = true; + } else if (streq(lvalue, "MemoryZSwapMax")) + c->memory_zswap_max = bytes; + else if (streq(lvalue, "StartupMemoryZSwapMax")) { + c->startup_memory_zswap_max = bytes; + c->startup_memory_zswap_max_set = true; + } else if (streq(lvalue, "MemoryLimit")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unit uses MemoryLimit=; please use MemoryMax= instead. Support for MemoryLimit= will be removed soon."); + c->memory_limit = bytes; + } else + return -EINVAL; + + return 0; +} + +int config_parse_tasks_max( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = userdata; + CGroupTasksMax *tasks_max = data; + uint64_t v; + int r; + + if (isempty(rvalue)) { + *tasks_max = u ? u->manager->defaults.tasks_max : CGROUP_TASKS_MAX_UNSET; + return 0; + } + + if (streq(rvalue, "infinity")) { + *tasks_max = CGROUP_TASKS_MAX_UNSET; + return 0; + } + + r = parse_permyriad(rvalue); + if (r >= 0) + *tasks_max = (CGroupTasksMax) { r, 10000U }; /* r‱ */ + else { + r = safe_atou64(rvalue, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid maximum tasks value '%s', ignoring: %m", rvalue); + return 0; + } + + if (v <= 0 || v >= UINT64_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Maximum tasks value '%s' out of range, ignoring.", rvalue); + return 0; + } + + *tasks_max = (CGroupTasksMax) { v }; + } + + return 0; +} + +int config_parse_delegate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + UnitType t; + int r; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_delegate) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Delegate= setting not supported for this unit type, ignoring."); + return 0; + } + + /* We either accept a boolean value, which may be used to turn on delegation for all controllers, or + * turn it off for all. Or it takes a list of controller names, in which case we add the specified + * controllers to the mask to delegate. Delegate= enables delegation without any controllers. */ + + if (isempty(rvalue)) { + /* An empty string resets controllers and sets Delegate=yes. */ + c->delegate = true; + c->delegate_controllers = 0; + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + CGroupMask mask = 0; + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + CGroupController cc; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + break; + + cc = cgroup_controller_from_string(word); + if (cc < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid controller name '%s', ignoring", word); + continue; + } + + mask |= CGROUP_CONTROLLER_TO_MASK(cc); + } + + c->delegate = true; + c->delegate_controllers |= mask; + + } else if (r > 0) { + c->delegate = true; + c->delegate_controllers = CGROUP_MASK_DELEGATE; + } else { + c->delegate = false; + c->delegate_controllers = 0; + } + + return 0; +} + +int config_parse_delegate_subgroup( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = ASSERT_PTR(data); + UnitType t; + + t = unit_name_to_type(unit); + assert(t >= 0); + + if (!unit_vtable[t]->can_delegate) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "DelegateSubgroup= setting not supported for this unit type, ignoring."); + return 0; + } + + if (isempty(rvalue)) { + c->delegate_subgroup = mfree(c->delegate_subgroup); + return 0; + } + + if (cg_needs_escape(rvalue)) { /* Insist that specified names don't need escaping */ + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid control group name, ignoring: %s", rvalue); + return 0; + } + + return free_and_strdup_warn(&c->delegate_subgroup, rvalue); +} + +int config_parse_managed_oom_mode( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ManagedOOMMode *mode = data, m; + UnitType t; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_set_managed_oom) + return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue); + + if (isempty(rvalue)) { + *mode = MANAGED_OOM_AUTO; + return 0; + } + + m = managed_oom_mode_from_string(rvalue); + if (m < 0) { + log_syntax(unit, LOG_WARNING, filename, line, m, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + *mode = m; + return 0; +} + +int config_parse_managed_oom_mem_pressure_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t *limit = data; + UnitType t; + int r; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_set_managed_oom) + return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue); + + if (isempty(rvalue)) { + *limit = 0; + return 0; + } + + r = parse_permyriad(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse memory pressure limit value, ignoring: %s", rvalue); + return 0; + } + + /* Normalize to 2^32-1 == 100% */ + *limit = UINT32_SCALE_FROM_PERMYRIAD(r); + return 0; +} + +int config_parse_device_allow( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupDevicePermissions permissions; + CGroupContext *c = data; + const char *p = rvalue; + int r; + + if (isempty(rvalue)) { + while (c->device_allow) + cgroup_context_free_device_allow(c, c->device_allow); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract device path and rights from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_path_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + if (!STARTSWITH_SET(resolved, "block-", "char-")) { + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (!valid_device_node_path(resolved)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid device node path '%s', ignoring.", resolved); + return 0; + } + } + + permissions = isempty(p) ? 0 : cgroup_device_permissions_from_string(p); + if (permissions < 0) { + log_syntax(unit, LOG_WARNING, filename, line, permissions, "Invalid device rights '%s', ignoring.", p); + return 0; + } + + return cgroup_context_add_device_allow(c, resolved, permissions); +} + +int config_parse_io_device_weight( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupIODeviceWeight *w; + CGroupContext *c = data; + const char *p = ASSERT_PTR(rvalue); + uint64_t u; + int r; + + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + while (c->io_device_weights) + cgroup_context_free_io_device_weight(c, c->io_device_weights); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract device path and weight from '%s', ignoring.", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid device path or weight specified in '%s', ignoring.", rvalue); + return 0; + } + + r = unit_path_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = cg_weight_parse(p, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "IO weight '%s' invalid, ignoring: %m", p); + return 0; + } + + assert(u != CGROUP_WEIGHT_INVALID); + + w = new0(CGroupIODeviceWeight, 1); + if (!w) + return log_oom(); + + w->path = TAKE_PTR(resolved); + w->weight = u; + + LIST_PREPEND(device_weights, c->io_device_weights, w); + return 0; +} + +int config_parse_io_device_latency( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupIODeviceLatency *l; + CGroupContext *c = data; + const char *p = ASSERT_PTR(rvalue); + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + while (c->io_device_latencies) + cgroup_context_free_io_device_latency(c, c->io_device_latencies); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract device path and latency from '%s', ignoring.", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid device path or latency specified in '%s', ignoring.", rvalue); + return 0; + } + + r = unit_path_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = parse_sec(p, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", p); + return 0; + } + + l = new0(CGroupIODeviceLatency, 1); + if (!l) + return log_oom(); + + l->path = TAKE_PTR(resolved); + l->target_usec = usec; + + LIST_PREPEND(device_latencies, c->io_device_latencies, l); + return 0; +} + +int config_parse_io_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupIODeviceLimit *l = NULL; + CGroupContext *c = data; + CGroupIOLimitType type; + const char *p = ASSERT_PTR(rvalue); + uint64_t num; + int r; + + assert(filename); + assert(lvalue); + + type = cgroup_io_limit_type_from_string(lvalue); + assert(type >= 0); + + if (isempty(rvalue)) { + LIST_FOREACH(device_limits, t, c->io_device_limits) + t->limits[type] = cgroup_io_limit_defaults[type]; + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid device node or bandwidth specified in '%s', ignoring.", rvalue); + return 0; + } + + r = unit_path_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (streq("infinity", p)) + num = CGROUP_LIMIT_MAX; + else { + r = parse_size(p, 1000, &num); + if (r < 0 || num <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid IO limit '%s', ignoring.", p); + return 0; + } + } + + LIST_FOREACH(device_limits, t, c->io_device_limits) + if (path_equal(resolved, t->path)) { + l = t; + break; + } + + if (!l) { + l = new0(CGroupIODeviceLimit, 1); + if (!l) + return log_oom(); + + l->path = TAKE_PTR(resolved); + for (CGroupIOLimitType i = 0; i < _CGROUP_IO_LIMIT_TYPE_MAX; i++) + l->limits[i] = cgroup_io_limit_defaults[i]; + + LIST_PREPEND(device_limits, c->io_device_limits, l); + } + + l->limits[type] = num; + + return 0; +} + +int config_parse_blockio_device_weight( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupBlockIODeviceWeight *w; + CGroupContext *c = data; + const char *p = ASSERT_PTR(rvalue); + uint64_t u; + int r; + + assert(filename); + assert(lvalue); + + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unit uses %s=; please use IO*= settings instead. Support for %s= will be removed soon.", + lvalue, lvalue); + + if (isempty(rvalue)) { + while (c->blockio_device_weights) + cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract device node and weight from '%s', ignoring.", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid device node or weight specified in '%s', ignoring.", rvalue); + return 0; + } + + r = unit_path_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = cg_blkio_weight_parse(p, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid block IO weight '%s', ignoring: %m", p); + return 0; + } + + assert(u != CGROUP_BLKIO_WEIGHT_INVALID); + + w = new0(CGroupBlockIODeviceWeight, 1); + if (!w) + return log_oom(); + + w->path = TAKE_PTR(resolved); + w->weight = u; + + LIST_PREPEND(device_weights, c->blockio_device_weights, w); + return 0; +} + +int config_parse_blockio_bandwidth( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupBlockIODeviceBandwidth *b = NULL; + CGroupContext *c = data; + const char *p = ASSERT_PTR(rvalue); + uint64_t bytes; + bool read; + int r; + + assert(filename); + assert(lvalue); + + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unit uses %s=; please use IO*= settings instead. Support for %s= will be removed soon.", + lvalue, lvalue); + + read = streq("BlockIOReadBandwidth", lvalue); + + if (isempty(rvalue)) { + LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths) { + t->rbps = CGROUP_LIMIT_MAX; + t->wbps = CGROUP_LIMIT_MAX; + } + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid device node or bandwidth specified in '%s', ignoring.", rvalue); + return 0; + } + + r = unit_path_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = parse_size(p, 1000, &bytes); + if (r < 0 || bytes <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid Block IO Bandwidth '%s', ignoring.", p); + return 0; + } + + LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths) + if (path_equal(resolved, t->path)) { + b = t; + break; + } + + if (!b) { + b = new0(CGroupBlockIODeviceBandwidth, 1); + if (!b) + return log_oom(); + + b->path = TAKE_PTR(resolved); + b->rbps = CGROUP_LIMIT_MAX; + b->wbps = CGROUP_LIMIT_MAX; + + LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, b); + } + + if (read) + b->rbps = bytes; + else + b->wbps = bytes; + + return 0; +} + +int config_parse_job_mode_isolate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + JobMode *m = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse boolean, ignoring: %s", rvalue); + return 0; + } + + log_notice("%s is deprecated. Please use OnFailureJobMode= instead", lvalue); + + *m = r ? JOB_ISOLATE : JOB_REPLACE; + return 0; +} + +int config_parse_exec_directories( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecDirectory *ed = ASSERT_PTR(data); + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + exec_directory_done(ed); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *tuple = NULL; + + r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue); + return 0; + } + if (r == 0) + return 0; + + _cleanup_free_ char *src = NULL, *dest = NULL; + const char *q = tuple; + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &src, &dest, NULL); + if (r == -ENOMEM) + return log_oom(); + if (r <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax in %s=, ignoring: %s", lvalue, tuple); + return 0; + } + + _cleanup_free_ char *sresolved = NULL; + r = unit_path_printf(u, src, &sresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in \"%s\", ignoring: %m", src); + continue; + } + + r = path_simplify_and_warn(sresolved, PATH_CHECK_RELATIVE, unit, filename, line, lvalue); + if (r < 0) + continue; + + if (path_startswith(sresolved, "private")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "%s= path can't be 'private', ignoring assignment: %s", lvalue, tuple); + continue; + } + + /* For State and Runtime directories we support an optional destination parameter, which + * will be used to create a symlink to the source. */ + _cleanup_free_ char *dresolved = NULL; + if (!isempty(dest)) { + if (streq(lvalue, "ConfigurationDirectory")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Destination parameter is not supported for ConfigurationDirectory, ignoring: %s", tuple); + continue; + } + + r = unit_path_printf(u, dest, &dresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in \"%s\", ignoring: %m", dest); + continue; + } + + r = path_simplify_and_warn(dresolved, PATH_CHECK_RELATIVE, unit, filename, line, lvalue); + if (r < 0) + continue; + } + + r = exec_directory_add(ed, sresolved, dresolved); + if (r < 0) + return log_oom(); + } +} + +int config_parse_set_credential( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *word = NULL, *k = NULL; + _cleanup_free_ void *d = NULL; + ExecContext *context = ASSERT_PTR(data); + ExecSetCredential *old; + Unit *u = userdata; + bool encrypted = ltype; + const char *p = ASSERT_PTR(rvalue); + size_t size; + int r; + + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + context->set_credentials = hashmap_free(context->set_credentials); + return 0; + } + + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract credential name, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_cred_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word); + return 0; + } + if (!credential_name_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k); + return 0; + } + + if (encrypted) { + r = unbase64mem_full(p, SIZE_MAX, true, &d, &size); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Encrypted credential data not valid Base64 data, ignoring."); + return 0; + } + } else { + char *unescaped; + ssize_t l; + + /* We support escape codes here, so that users can insert trailing \n if they like */ + l = cunescape(p, UNESCAPE_ACCEPT_NUL, &unescaped); + if (l < 0) { + log_syntax(unit, LOG_WARNING, filename, line, l, "Can't unescape \"%s\", ignoring: %m", p); + return 0; + } + + d = unescaped; + size = l; + } + + old = hashmap_get(context->set_credentials, k); + if (old) { + free_and_replace(old->data, d); + old->size = size; + old->encrypted = encrypted; + } else { + _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL; + + sc = new(ExecSetCredential, 1); + if (!sc) + return log_oom(); + + *sc = (ExecSetCredential) { + .id = TAKE_PTR(k), + .data = TAKE_PTR(d), + .size = size, + .encrypted = encrypted, + }; + + r = hashmap_ensure_put(&context->set_credentials, &exec_set_credential_hash_ops, sc->id, sc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Duplicated credential value '%s', ignoring assignment: %s", sc->id, rvalue); + return 0; + } + + TAKE_PTR(sc); + } + + return 0; +} + +int hashmap_put_credential(Hashmap **h, const char *id, const char *path, bool encrypted) { + ExecLoadCredential *old; + int r; + + assert(h); + assert(id); + assert(path); + + old = hashmap_get(*h, id); + if (old) { + r = free_and_strdup(&old->path, path); + if (r < 0) + return r; + + old->encrypted = encrypted; + } else { + _cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL; + + lc = new(ExecLoadCredential, 1); + if (!lc) + return log_oom(); + + *lc = (ExecLoadCredential) { + .id = strdup(id), + .path = strdup(path), + .encrypted = encrypted, + }; + if (!lc->id || !lc->path) + return -ENOMEM; + + r = hashmap_ensure_put(h, &exec_load_credential_hash_ops, lc->id, lc); + if (r < 0) + return r; + + TAKE_PTR(lc); + } + + return 0; +} + +int config_parse_load_credential( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *word = NULL, *k = NULL, *q = NULL; + ExecContext *context = ASSERT_PTR(data); + bool encrypted = ltype; + Unit *u = userdata; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + context->load_credentials = hashmap_free(context->load_credentials); + return 0; + } + + p = rvalue; + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_cred_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word); + return 0; + } + if (!credential_name_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k); + return 0; + } + + if (isempty(p)) { + /* If only one field is specified take it as shortcut for inheriting a credential named + * the same way from our parent */ + q = strdup(k); + if (!q) + return log_oom(); + } else { + r = unit_path_printf(u, p, &q); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", p); + return 0; + } + if (path_is_absolute(q) ? !path_is_normalized(q) : !credential_name_valid(q)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential source \"%s\" not valid, ignoring.", q); + return 0; + } + } + + r = hashmap_put_credential(&context->load_credentials, k, q, encrypted); + if (r < 0) + return log_error_errno(r, "Failed to store load credential '%s': %m", rvalue); + + return 0; +} + +int config_parse_import_credential( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *s = NULL; + Set** import_credentials = ASSERT_PTR(data); + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *import_credentials = set_free_free(*import_credentials); + return 0; + } + + r = unit_cred_printf(u, rvalue, &s); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", s); + return 0; + } + if (!credential_glob_valid(s)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name or glob \"%s\" not valid, ignoring.", s); + return 0; + } + + r = set_put_strdup(import_credentials, s); + if (r < 0) + return log_error_errno(r, "Failed to store credential name '%s': %m", rvalue); + + return 0; +} + +int config_parse_set_status( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExitStatusSet *status_set = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* Empty assignment resets the list */ + if (isempty(rvalue)) { + exit_status_set_free(status_set); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + Bitmap *bitmap; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=%s, ignoring: %m", lvalue, rvalue); + return 0; + } + if (r == 0) + return 0; + + /* We need to call exit_status_from_string() first, because we want + * to parse numbers as exit statuses, not signals. */ + + r = exit_status_from_string(word); + if (r >= 0) { + assert(r >= 0 && r < 256); + bitmap = &status_set->status; + } else { + r = signal_from_string(word); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse value, ignoring: %s", word); + continue; + } + bitmap = &status_set->signal; + } + + r = bitmap_set(bitmap, r); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to set signal or status %s, ignoring: %m", word); + } +} + +int config_parse_namespace_path_strv( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = userdata; + char*** sv = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *sv = strv_free(*sv); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *resolved = NULL, *joined = NULL; + const char *w; + bool ignore_enoent = false, shall_prefix = false; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + break; + + w = word; + if (startswith(w, "-")) { + ignore_enoent = true; + w++; + } + if (startswith(w, "+")) { + shall_prefix = true; + w++; + } + + r = unit_path_printf(u, w, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", w); + continue; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + joined = strjoin(ignore_enoent ? "-" : "", + shall_prefix ? "+" : "", + resolved); + + r = strv_push(sv, joined); + if (r < 0) + return log_oom(); + + joined = NULL; + } + + return 0; +} + +int config_parse_temporary_filesystems( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = userdata; + ExecContext *c = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems); + c->temporary_filesystems = NULL; + c->n_temporary_filesystems = 0; + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *path = NULL, *resolved = NULL; + const char *w; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + w = word; + r = extract_first_word(&w, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", word); + continue; + } + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", word); + continue; + } + + r = unit_path_printf(u, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", path); + continue; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, resolved, w); + if (r < 0) + return log_oom(); + } +} + +int config_parse_bind_paths( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + bind_mount_free_many(c->bind_mounts, c->n_bind_mounts); + c->bind_mounts = NULL; + c->n_bind_mounts = 0; + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *source = NULL, *destination = NULL; + _cleanup_free_ char *sresolved = NULL, *dresolved = NULL; + char *s = NULL, *d = NULL; + bool rbind = true, ignore_enoent = false; + + r = extract_first_word(&p, &source, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue); + return 0; + } + if (r == 0) + break; + + r = unit_full_printf_full(u, source, PATH_MAX, &sresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in \"%s\", ignoring: %m", source); + continue; + } + + s = sresolved; + if (s[0] == '-') { + ignore_enoent = true; + s++; + } + + r = path_simplify_and_warn(s, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + /* Optionally, the destination is specified. */ + if (p && p[-1] == ':') { + r = extract_first_word(&p, &destination, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue); + return 0; + } + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing argument after ':', ignoring: %s", s); + continue; + } + + r = unit_path_printf(u, destination, &dresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve specifiers in \"%s\", ignoring: %m", destination); + continue; + } + + r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + d = dresolved; + + /* Optionally, there's also a short option string specified */ + if (p && p[-1] == ':') { + _cleanup_free_ char *options = NULL; + + r = extract_first_word(&p, &options, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + + if (isempty(options) || streq(options, "rbind")) + rbind = true; + else if (streq(options, "norbind")) + rbind = false; + else { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid option string, ignoring setting: %s", options); + continue; + } + } + } else + d = s; + + r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts, + &(BindMount) { + .source = s, + .destination = d, + .read_only = !!strstr(lvalue, "ReadOnly"), + .recursive = rbind, + .ignore_enoent = ignore_enoent, + }); + if (r < 0) + return log_oom(); + } + + return 0; +} + +int config_parse_mount_images( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *first = NULL, *second = NULL, *tuple = NULL; + _cleanup_free_ char *sresolved = NULL, *dresolved = NULL; + const char *q = NULL; + char *s = NULL; + bool permissive = false; + + r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue); + return 0; + } + if (r == 0) + return 0; + + q = tuple; + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second, NULL); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax in %s=, ignoring: %s", lvalue, tuple); + return 0; + } + if (r == 0) + continue; + + s = first; + if (s[0] == '-') { + permissive = true; + s++; + } + + r = unit_path_printf(u, s, &sresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in \"%s\", ignoring: %m", s); + continue; + } + + r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + if (isempty(second)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing destination in %s, ignoring: %s", lvalue, rvalue); + continue; + } + + r = unit_path_printf(u, second, &dresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve specifiers in \"%s\", ignoring: %m", second); + continue; + } + + r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + for (;;) { + _cleanup_free_ char *partition = NULL, *mount_options = NULL, *mount_options_resolved = NULL; + MountOptions *o = NULL; + PartitionDesignator partition_designator; + + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", q); + return 0; + } + if (r == 0) + break; + /* Single set of options, applying to the root partition/single filesystem */ + if (r == 1) { + r = unit_full_printf(u, partition, &mount_options_resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", first); + continue; + } + + o = new(MountOptions, 1); + if (!o) + return log_oom(); + *o = (MountOptions) { + .partition_designator = PARTITION_ROOT, + .options = TAKE_PTR(mount_options_resolved), + }; + LIST_APPEND(mount_options, options, o); + + break; + } + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) { + log_syntax(unit, LOG_WARNING, filename, line, partition_designator, + "Invalid partition name %s, ignoring", partition); + continue; + } + r = unit_full_printf(u, mount_options, &mount_options_resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options); + continue; + } + + o = new(MountOptions, 1); + if (!o) + return log_oom(); + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = TAKE_PTR(mount_options_resolved), + }; + LIST_APPEND(mount_options, options, o); + } + + r = mount_image_add(&c->mount_images, &c->n_mount_images, + &(MountImage) { + .source = sresolved, + .destination = dresolved, + .mount_options = options, + .ignore_enoent = permissive, + .type = MOUNT_IMAGE_DISCRETE, + }); + if (r < 0) + return log_oom(); + } +} + +int config_parse_extension_images( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *source = NULL, *tuple = NULL, *sresolved = NULL; + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + bool permissive = false; + const char *q = NULL; + char *s = NULL; + + r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue); + return 0; + } + if (r == 0) + return 0; + + q = tuple; + r = extract_first_word(&q, &source, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax in %s=, ignoring: %s", lvalue, tuple); + return 0; + } + if (r == 0) + continue; + + s = source; + if (s[0] == '-') { + permissive = true; + s++; + } + + r = unit_path_printf(u, s, &sresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in \"%s\", ignoring: %m", s); + continue; + } + + r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + for (;;) { + _cleanup_free_ char *partition = NULL, *mount_options = NULL, *mount_options_resolved = NULL; + MountOptions *o = NULL; + PartitionDesignator partition_designator; + + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", q); + return 0; + } + if (r == 0) + break; + /* Single set of options, applying to the root partition/single filesystem */ + if (r == 1) { + r = unit_full_printf(u, partition, &mount_options_resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", partition); + continue; + } + + o = new(MountOptions, 1); + if (!o) + return log_oom(); + *o = (MountOptions) { + .partition_designator = PARTITION_ROOT, + .options = TAKE_PTR(mount_options_resolved), + }; + LIST_APPEND(mount_options, options, o); + + break; + } + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid partition name %s, ignoring", partition); + continue; + } + r = unit_full_printf(u, mount_options, &mount_options_resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options); + continue; + } + + o = new(MountOptions, 1); + if (!o) + return log_oom(); + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = TAKE_PTR(mount_options_resolved), + }; + LIST_APPEND(mount_options, options, o); + } + + r = mount_image_add(&c->extension_images, &c->n_extension_images, + &(MountImage) { + .source = sresolved, + .mount_options = options, + .ignore_enoent = permissive, + .type = MOUNT_IMAGE_EXTENSION, + }); + if (r < 0) + return log_oom(); + } +} + +int config_parse_job_timeout_sec( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = ASSERT_PTR(data); + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_sec_fix_0(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobTimeoutSec= parameter, ignoring: %s", rvalue); + return 0; + } + + /* If the user explicitly changed JobTimeoutSec= also change JobRunningTimeoutSec=, for compatibility with old + * versions. If JobRunningTimeoutSec= was explicitly set, avoid this however as whatever the user picked should + * count. */ + + if (!u->job_running_timeout_set) + u->job_running_timeout = usec; + + u->job_timeout = usec; + + return 0; +} + +int config_parse_job_running_timeout_sec( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = ASSERT_PTR(data); + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_sec_fix_0(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobRunningTimeoutSec= parameter, ignoring: %s", rvalue); + return 0; + } + + u->job_running_timeout = usec; + u->job_running_timeout_set = true; + + return 0; +} + +int config_parse_emergency_action( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + EmergencyAction *x = ASSERT_PTR(data); + RuntimeScope runtime_scope; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* If we have a unit determine the scope based on it */ + if (unit) + runtime_scope = ((Unit*) ASSERT_PTR(userdata))->manager->runtime_scope; + else + runtime_scope = ltype; /* otherwise, assume the scope is passed in via ltype */ + + r = parse_emergency_action(rvalue, runtime_scope, x); + if (r < 0) { + if (r == -EOPNOTSUPP) + log_syntax(unit, LOG_WARNING, filename, line, r, + "%s= specified as %s mode action, ignoring: %s", + lvalue, runtime_scope_to_string(runtime_scope), rvalue); + else + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + + return 0; +} + +int config_parse_pid_file( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL, *n = NULL; + const Unit *u = ASSERT_PTR(userdata); + char **s = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* An empty assignment removes already set value. */ + *s = mfree(*s); + return 0; + } + + r = unit_path_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + /* If this is a relative path make it absolute by prefixing the /run */ + n = path_make_absolute(k, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + if (!n) + return log_oom(); + + /* Check that the result is a sensible path */ + r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return r; + + r = patch_var_run(unit, filename, line, lvalue, &n); + if (r < 0) + return r; + + free_and_replace(*s, n); + return 0; +} + +int config_parse_exit_status( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *exit_status = data, r; + uint8_t u; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(exit_status); + + if (isempty(rvalue)) { + *exit_status = -1; + return 0; + } + + r = safe_atou8(rvalue, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse exit status '%s', ignoring: %m", rvalue); + return 0; + } + + *exit_status = u; + return 0; +} + +int config_parse_disable_controllers( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int r; + CGroupContext *c = data; + CGroupMask disabled_mask; + + /* 1. If empty, make all controllers eligible for use again. + * 2. If non-empty, merge all listed controllers, space separated. */ + + if (isempty(rvalue)) { + c->disable_controllers = 0; + return 0; + } + + r = cg_mask_from_string(rvalue, &disabled_mask); + if (r < 0 || disabled_mask <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue); + return 0; + } + + c->disable_controllers |= disabled_mask; + + return 0; +} + +int config_parse_ip_filter_bpf_progs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *resolved = NULL; + const Unit *u = userdata; + char ***paths = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *paths = strv_free(*paths); + return 0; + } + + r = unit_path_printf(u, rvalue, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (strv_contains(*paths, resolved)) + return 0; + + r = strv_extend(paths, resolved); + if (r < 0) + return log_oom(); + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) { + static bool warned = false; + + log_full(warned ? LOG_DEBUG : LOG_WARNING, + "File %s:%u configures an IP firewall with BPF programs (%s=%s), but the local system does not support BPF/cgroup based firewalling with multiple filters.\n" + "Starting this unit will fail! (This warning is only shown for the first loaded unit using IP firewalling.)", filename, line, lvalue, rvalue); + + warned = true; + } + + return 0; +} + +int config_parse_bpf_foreign_program( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + _cleanup_free_ char *resolved = NULL, *word = NULL; + CGroupContext *c = data; + const char *p = ASSERT_PTR(rvalue); + Unit *u = userdata; + int attach_type, r; + + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + while (c->bpf_foreign_programs) + cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs); + + return 0; + } + + r = extract_first_word(&p, &word, ":", 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse foreign BPF program, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax in %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + + attach_type = bpf_cgroup_attach_type_from_string(word); + if (attach_type < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown BPF attach type=%s, ignoring: %s", word, rvalue); + return 0; + } + + r = unit_path_printf(u, p, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %s", p, rvalue); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = cgroup_context_add_bpf_foreign_program(c, attach_type, resolved); + if (r < 0) + return log_error_errno(r, "Failed to add foreign BPF program to cgroup context: %m"); + + return 0; +} + +int config_parse_cgroup_socket_bind( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + _cleanup_free_ CGroupSocketBindItem *item = NULL; + CGroupSocketBindItem **head = data; + uint16_t nr_ports, port_min; + int af, ip_protocol, r; + + if (isempty(rvalue)) { + cgroup_context_remove_socket_bind(head); + return 0; + } + + r = parse_socket_bind_item(rvalue, &af, &ip_protocol, &nr_ports, &port_min); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Unable to parse %s= assignment, ignoring: %s", lvalue, rvalue); + return 0; + } + + item = new(CGroupSocketBindItem, 1); + if (!item) + return log_oom(); + *item = (CGroupSocketBindItem) { + .address_family = af, + .ip_protocol = ip_protocol, + .nr_ports = nr_ports, + .port_min = port_min, + }; + + LIST_PREPEND(socket_bind_items, *head, TAKE_PTR(item)); + + return 0; +} + +int config_parse_restrict_network_interfaces( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + CGroupContext *c = ASSERT_PTR(data); + bool is_allow_rule = true; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces); + return 0; + } + + if (rvalue[0] == '~') { + is_allow_rule = false; + rvalue++; + } + + if (set_isempty(c->restrict_network_interfaces)) + /* Only initialize this when creating the set */ + c->restrict_network_interfaces_is_allow_list = is_allow_rule; + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Trailing garbage in %s, ignoring: %s", lvalue, rvalue); + break; + } + + if (!ifname_valid(word)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", word); + continue; + } + + if (c->restrict_network_interfaces_is_allow_list != is_allow_rule) + free(set_remove(c->restrict_network_interfaces, word)); + else { + r = set_put_strdup(&c->restrict_network_interfaces, word); + if (r < 0) + return log_oom(); + } + } + + return 0; +} + +static int merge_by_names(Unit *u, Set *names, const char *id) { + char *k; + int r; + + assert(u); + + /* Let's try to add in all names that are aliases of this unit */ + while ((k = set_steal_first(names))) { + _cleanup_free_ _unused_ char *free_k = k; + + /* First try to merge in the other name into our unit */ + r = unit_merge_by_name(u, k); + if (r < 0) { + Unit *other; + + /* Hmm, we couldn't merge the other unit into ours? Then let's try it the other way + * round. */ + + other = manager_get_unit(u->manager, k); + if (!other) + return r; /* return previous failure */ + + r = unit_merge(other, u); + if (r < 0) + return r; + + return merge_by_names(other, names, NULL); + } + + if (streq_ptr(id, k)) + unit_choose_id(u, id); + } + + return 0; +} + +int unit_load_fragment(Unit *u) { + const char *fragment; + _cleanup_set_free_free_ Set *names = NULL; + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + assert(u->id); + + if (u->transient) { + u->access_selinux_context = mfree(u->access_selinux_context); + u->load_state = UNIT_LOADED; + return 0; + } + + /* Possibly rebuild the fragment map to catch new units */ + r = unit_file_build_name_map(&u->manager->lookup_paths, + &u->manager->unit_cache_timestamp_hash, + &u->manager->unit_id_map, + &u->manager->unit_name_map, + &u->manager->unit_path_cache); + if (r < 0) + return log_error_errno(r, "Failed to rebuild name map: %m"); + + r = unit_file_find_fragment(u->manager->unit_id_map, + u->manager->unit_name_map, + u->id, + &fragment, + &names); + if (r < 0 && r != -ENOENT) + return r; + + if (fragment) { + /* Open the file, check if this is a mask, otherwise read. */ + _cleanup_fclose_ FILE *f = NULL; + struct stat st; + + /* Try to open the file name. A symlink is OK, for example for linked files or masks. We + * expect that all symlinks within the lookup paths have been already resolved, but we don't + * verify this here. */ + f = fopen(fragment, "re"); + if (!f) + return log_unit_notice_errno(u, errno, "Failed to open %s: %m", fragment); + + if (fstat(fileno(f), &st) < 0) + return -errno; + + r = free_and_strdup(&u->fragment_path, fragment); + if (r < 0) + return r; + + if (null_or_empty(&st)) { + /* Unit file is masked */ + + u->load_state = u->perpetual ? UNIT_LOADED : UNIT_MASKED; /* don't allow perpetual units to ever be masked */ + u->fragment_mtime = 0; + u->access_selinux_context = mfree(u->access_selinux_context); + } else { +#if HAVE_SELINUX + if (mac_selinux_use()) { + _cleanup_freecon_ char *selcon = NULL; + + /* Cache the SELinux context of the unit file here. We'll make use of when checking access permissions to loaded units */ + r = fgetfilecon_raw(fileno(f), &selcon); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to read SELinux context of '%s', ignoring: %m", fragment); + + r = free_and_strdup(&u->access_selinux_context, selcon); + if (r < 0) + return r; + } else +#endif + u->access_selinux_context = mfree(u->access_selinux_context); + + u->load_state = UNIT_LOADED; + u->fragment_mtime = timespec_load(&st.st_mtim); + + /* Now, parse the file contents */ + r = config_parse(u->id, fragment, f, + UNIT_VTABLE(u)->sections, + config_item_perf_lookup, load_fragment_gperf_lookup, + 0, + u, + NULL); + if (r == -ENOEXEC) + log_unit_notice_errno(u, r, "Unit configuration has fatal error, unit will not be started."); + if (r < 0) + return r; + } + } + + /* Call merge_by_names with the name derived from the fragment path as the preferred name. + * + * We do the merge dance here because for some unit types, the unit might have aliases which are not + * declared in the file system. In particular, this is true (and frequent) for device and swap units. + */ + const char *id = u->id; + _cleanup_free_ char *filename = NULL, *free_id = NULL; + + if (fragment) { + r = path_extract_filename(fragment, &filename); + if (r < 0) + return log_debug_errno(r, "Failed to extract filename from fragment '%s': %m", fragment); + id = filename; + + if (unit_name_is_valid(id, UNIT_NAME_TEMPLATE)) { + assert(u->instance); /* If we're not trying to use a template for non-instanced unit, + * this must be set. */ + + r = unit_name_replace_instance(id, u->instance, &free_id); + if (r < 0) + return log_debug_errno(r, "Failed to build id (%s + %s): %m", id, u->instance); + id = free_id; + } + } + + return merge_by_names(u, names, id); +} + +void unit_dump_config_items(FILE *f) { + static const struct { + const ConfigParserCallback callback; + const char *rvalue; + } table[] = { + { config_parse_warn_compat, "NOTSUPPORTED" }, + { config_parse_int, "INTEGER" }, + { config_parse_unsigned, "UNSIGNED" }, + { config_parse_iec_size, "SIZE" }, + { config_parse_iec_uint64, "SIZE" }, + { config_parse_si_uint64, "SIZE" }, + { config_parse_bool, "BOOLEAN" }, + { config_parse_string, "STRING" }, + { config_parse_path, "PATH" }, + { config_parse_unit_path_printf, "PATH" }, + { config_parse_colon_separated_paths, "PATH" }, + { config_parse_strv, "STRING [...]" }, + { config_parse_exec_nice, "NICE" }, + { config_parse_exec_oom_score_adjust, "OOMSCOREADJUST" }, + { config_parse_exec_io_class, "IOCLASS" }, + { config_parse_exec_io_priority, "IOPRIORITY" }, + { config_parse_exec_cpu_sched_policy, "CPUSCHEDPOLICY" }, + { config_parse_exec_cpu_sched_prio, "CPUSCHEDPRIO" }, + { config_parse_exec_cpu_affinity, "CPUAFFINITY" }, + { config_parse_mode, "MODE" }, + { config_parse_unit_env_file, "FILE" }, + { config_parse_exec_output, "OUTPUT" }, + { config_parse_exec_input, "INPUT" }, + { config_parse_log_facility, "FACILITY" }, + { config_parse_log_level, "LEVEL" }, + { config_parse_exec_secure_bits, "SECUREBITS" }, + { config_parse_capability_set, "BOUNDINGSET" }, + { config_parse_rlimit, "LIMIT" }, + { config_parse_unit_deps, "UNIT [...]" }, + { config_parse_exec, "PATH [ARGUMENT [...]]" }, + { config_parse_service_type, "SERVICETYPE" }, + { config_parse_service_exit_type, "SERVICEEXITTYPE" }, + { config_parse_service_restart, "SERVICERESTART" }, + { config_parse_service_restart_mode, "SERVICERESTARTMODE" }, + { config_parse_service_timeout_failure_mode, "TIMEOUTMODE" }, + { config_parse_kill_mode, "KILLMODE" }, + { config_parse_signal, "SIGNAL" }, + { config_parse_socket_listen, "SOCKET [...]" }, + { config_parse_socket_bind, "SOCKETBIND" }, + { config_parse_socket_bindtodevice, "NETWORKINTERFACE" }, + { config_parse_sec, "SECONDS" }, + { config_parse_nsec, "NANOSECONDS" }, + { config_parse_namespace_path_strv, "PATH [...]" }, + { config_parse_bind_paths, "PATH[:PATH[:OPTIONS]] [...]" }, + { config_parse_unit_requires_mounts_for, + "PATH [...]" }, + { config_parse_exec_mount_propagation_flag, + "MOUNTFLAG" }, + { config_parse_unit_string_printf, "STRING" }, + { config_parse_trigger_unit, "UNIT" }, + { config_parse_timer, "TIMER" }, + { config_parse_path_spec, "PATH" }, + { config_parse_notify_access, "ACCESS" }, + { config_parse_ip_tos, "TOS" }, + { config_parse_unit_condition_path, "CONDITION" }, + { config_parse_unit_condition_string, "CONDITION" }, + { config_parse_unit_slice, "SLICE" }, + { config_parse_documentation, "URL" }, + { config_parse_service_timeout, "SECONDS" }, + { config_parse_emergency_action, "ACTION" }, + { config_parse_set_status, "STATUS" }, + { config_parse_service_sockets, "SOCKETS" }, + { config_parse_environ, "ENVIRON" }, +#if HAVE_SECCOMP + { config_parse_syscall_filter, "SYSCALLS" }, + { config_parse_syscall_archs, "ARCHS" }, + { config_parse_syscall_errno, "ERRNO" }, + { config_parse_syscall_log, "SYSCALLS" }, + { config_parse_address_families, "FAMILIES" }, + { config_parse_restrict_namespaces, "NAMESPACES" }, +#endif + { config_parse_restrict_filesystems, "FILESYSTEMS" }, + { config_parse_cpu_shares, "SHARES" }, + { config_parse_cg_weight, "WEIGHT" }, + { config_parse_cg_cpu_weight, "CPUWEIGHT" }, + { config_parse_memory_limit, "LIMIT" }, + { config_parse_device_allow, "DEVICE" }, + { config_parse_device_policy, "POLICY" }, + { config_parse_io_limit, "LIMIT" }, + { config_parse_io_device_weight, "DEVICEWEIGHT" }, + { config_parse_io_device_latency, "DEVICELATENCY" }, + { config_parse_blockio_bandwidth, "BANDWIDTH" }, + { config_parse_blockio_weight, "WEIGHT" }, + { config_parse_blockio_device_weight, "DEVICEWEIGHT" }, + { config_parse_long, "LONG" }, + { config_parse_socket_service, "SERVICE" }, +#if HAVE_SELINUX + { config_parse_exec_selinux_context, "LABEL" }, +#endif + { config_parse_job_mode, "MODE" }, + { config_parse_job_mode_isolate, "BOOLEAN" }, + { config_parse_personality, "PERSONALITY" }, + { config_parse_log_filter_patterns, "REGEX" }, + }; + + const char *prev = NULL; + + assert(f); + + NULSTR_FOREACH(i, load_fragment_gperf_nulstr) { + const char *rvalue = "OTHER", *lvalue; + const ConfigPerfItem *p; + const char *dot; + + assert_se(p = load_fragment_gperf_lookup(i, strlen(i))); + + /* Hide legacy settings */ + if (p->parse == config_parse_warn_compat && + p->ltype == DISABLED_LEGACY) + continue; + + for (size_t j = 0; j < ELEMENTSOF(table); j++) + if (p->parse == table[j].callback) { + rvalue = table[j].rvalue; + break; + } + + dot = strchr(i, '.'); + lvalue = dot ? dot + 1 : i; + + if (dot) { + size_t prefix_len = dot - i; + + if (!prev || !strneq(prev, i, prefix_len+1)) { + if (prev) + fputc('\n', f); + + fprintf(f, "[%.*s]\n", (int) prefix_len, i); + } + } + + fprintf(f, "%s=%s\n", lvalue, rvalue); + prev = i; + } +} + +int config_parse_cpu_affinity2( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CPUSet *affinity = ASSERT_PTR(data); + + (void) parse_cpu_set_extend(rvalue, affinity, true, unit, filename, line, lvalue); + + return 0; +} + +int config_parse_show_status( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int k; + ShowStatus *b = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + k = parse_show_status(rvalue, b); + if (k < 0) + log_syntax(unit, LOG_WARNING, filename, line, k, "Failed to parse show status setting, ignoring: %s", rvalue); + + return 0; +} + +int config_parse_output_restricted( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecOutput t, *eo = ASSERT_PTR(data); + bool obsolete = false; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (streq(rvalue, "syslog")) { + t = EXEC_OUTPUT_JOURNAL; + obsolete = true; + } else if (streq(rvalue, "syslog+console")) { + t = EXEC_OUTPUT_JOURNAL_AND_CONSOLE; + obsolete = true; + } else { + t = exec_output_from_string(rvalue); + if (t < 0) { + log_syntax(unit, LOG_WARNING, filename, line, t, "Failed to parse output type, ignoring: %s", rvalue); + return 0; + } + + if (IN_SET(t, EXEC_OUTPUT_SOCKET, EXEC_OUTPUT_NAMED_FD, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Standard output types socket, fd:, file:, append:, truncate: are not supported as defaults, ignoring: %s", rvalue); + return 0; + } + } + + if (obsolete) + log_syntax(unit, LOG_NOTICE, filename, line, 0, + "Standard output type %s is obsolete, automatically updating to %s. Please update your configuration.", + rvalue, exec_output_to_string(t)); + + *eo = t; + return 0; +} + +int config_parse_crash_chvt( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = parse_crash_chvt(rvalue, data); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CrashChangeVT= setting, ignoring: %s", rvalue); + + return 0; +} + +int config_parse_swap_priority( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Swap *s = ASSERT_PTR(userdata); + int r, priority; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + s->parameters_fragment.priority = -1; + s->parameters_fragment.priority_set = false; + return 0; + } + + r = safe_atoi(rvalue, &priority); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid swap priority '%s', ignoring.", rvalue); + return 0; + } + + if (priority < -1) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Sorry, swap priorities smaller than -1 may only be assigned by the kernel itself, ignoring: %s", rvalue); + return 0; + } + + if (priority > 32767) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Swap priority out of range, ignoring: %s", rvalue); + return 0; + } + + s->parameters_fragment.priority = priority; + s->parameters_fragment.priority_set = true; + return 0; +} + +int config_parse_watchdog_sec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t *usec = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* This is called for {Runtime,Reboot,KExec}WatchdogSec= where "default" maps to + * USEC_INFINITY internally. */ + + if (streq(rvalue, "default")) + *usec = USEC_INFINITY; + else if (streq(rvalue, "off")) + *usec = 0; + else + return config_parse_sec(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata); + + return 0; +} + +int config_parse_tty_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + unsigned *sz = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *sz = UINT_MAX; + return 0; + } + + return config_parse_unsigned(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata); +} + +int config_parse_log_filter_patterns( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + const char *pattern = ASSERT_PTR(rvalue); + bool is_allowlist = true; + int r; + + assert(filename); + assert(lvalue); + + if (isempty(pattern)) { + /* Empty assignment resets the lists. */ + c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns); + c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns); + return 0; + } + + if (pattern[0] == '~') { + is_allowlist = false; + pattern++; + if (isempty(pattern)) + /* LogFilterPatterns=~ is not considered a valid pattern. */ + return log_syntax(unit, LOG_WARNING, filename, line, 0, + "Regex pattern invalid, ignoring: %s=%s", lvalue, rvalue); + } + + if (pattern_compile_and_log(pattern, 0, NULL) < 0) + return 0; + + r = set_put_strdup(is_allowlist ? &c->log_filter_allowed_patterns : &c->log_filter_denied_patterns, + pattern); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store log filtering pattern, ignoring: %s=%s", lvalue, rvalue); + return 0; + } + + return 0; +} + +int config_parse_open_file( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(open_file_freep) OpenFile *of = NULL; + OpenFile **head = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + open_file_free_many(head); + return 0; + } + + r = open_file_parse(rvalue, &of); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse OpenFile= setting, ignoring: %s", rvalue); + return 0; + } + + LIST_APPEND(open_files, *head, TAKE_PTR(of)); + + return 0; +} + +int config_parse_cgroup_nft_set( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = ASSERT_PTR(data); + Unit *u = ASSERT_PTR(userdata); + + return config_parse_nft_set(unit, filename, line, section, section_line, lvalue, ltype, rvalue, &c->nft_set_context, u); +} diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h new file mode 100644 index 0000000..6919805 --- /dev/null +++ b/src/core/load-fragment.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "unit.h" + +/* These functions are declared in the header to make them accessible to unit tests. */ +bool contains_instance_specifier_superset(const char *s); +int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, const char *format); + +/* Config-parsing helpers relevant only for sources under src/core/ */ +int parse_crash_chvt(const char *value, int *data); +int parse_confirm_spawn(const char *value, char **console); + +int hashmap_put_credential(Hashmap **h, const char *id, const char *path, bool encrypted); + +/* Read service data from .desktop file style configuration fragments */ + +int unit_load_fragment(Unit *u); + +void unit_dump_config_items(FILE *f); + +CONFIG_PARSER_PROTOTYPE(config_parse_unit_deps); +CONFIG_PARSER_PROTOTYPE(config_parse_obsolete_unit_deps); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_string_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_strv_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_colon_separated_paths); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_strv_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_documentation); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_listen); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_protocol); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_bind); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_nice); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_oom_score_adjust); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_coredump_filter); +CONFIG_PARSER_PROTOTYPE(config_parse_exec); +CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout); +CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_abort); +CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_failure_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_service_type); +CONFIG_PARSER_PROTOTYPE(config_parse_service_exit_type); +CONFIG_PARSER_PROTOTYPE(config_parse_service_restart); +CONFIG_PARSER_PROTOTYPE(config_parse_service_restart_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_bindtodevice); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_output); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_input); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_text); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_data); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_class); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_priority); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_prio); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_affinity); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_apivfs); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_secure_bits); +CONFIG_PARSER_PROTOTYPE(config_parse_root_image_options); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash_sig); +CONFIG_PARSER_PROTOTYPE(config_parse_capability_set); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_propagation_flag); +CONFIG_PARSER_PROTOTYPE(config_parse_timer); +CONFIG_PARSER_PROTOTYPE(config_parse_trigger_unit); +CONFIG_PARSER_PROTOTYPE(config_parse_path_spec); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_service); +CONFIG_PARSER_PROTOTYPE(config_parse_service_sockets); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_env_file); +CONFIG_PARSER_PROTOTYPE(config_parse_ip_tos); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_path); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_string); +CONFIG_PARSER_PROTOTYPE(config_parse_kill_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_notify_access); +CONFIG_PARSER_PROTOTYPE(config_parse_emergency_action); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_requires_mounts_for); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_filter); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_archs); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_errno); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_log); +CONFIG_PARSER_PROTOTYPE(config_parse_environ); +CONFIG_PARSER_PROTOTYPE(config_parse_pass_environ); +CONFIG_PARSER_PROTOTYPE(config_parse_unset_environ); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_slice); +CONFIG_PARSER_PROTOTYPE(config_parse_cg_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_cg_cpu_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares); +CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max); +CONFIG_PARSER_PROTOTYPE(config_parse_delegate); +CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup); +CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference); +CONFIG_PARSER_PROTOTYPE(config_parse_device_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_device_allow); +CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency); +CONFIG_PARSER_PROTOTYPE(config_parse_io_device_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_io_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_blockio_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_blockio_device_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_blockio_bandwidth); +CONFIG_PARSER_PROTOTYPE(config_parse_job_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_job_mode_isolate); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_selinux_context); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_apparmor_profile); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_smack_process_label); +CONFIG_PARSER_PROTOTYPE(config_parse_address_families); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_preserve_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_directories); +CONFIG_PARSER_PROTOTYPE(config_parse_set_credential); +CONFIG_PARSER_PROTOTYPE(config_parse_load_credential); +CONFIG_PARSER_PROTOTYPE(config_parse_import_credential); +CONFIG_PARSER_PROTOTYPE(config_parse_set_status); +CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv); +CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems); +CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota); +CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset); +CONFIG_PARSER_PROTOTYPE(config_parse_protect_home); +CONFIG_PARSER_PROTOTYPE(config_parse_protect_system); +CONFIG_PARSER_PROTOTYPE(config_parse_bus_name); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_utmp_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_working_directory); +CONFIG_PARSER_PROTOTYPE(config_parse_fdname); +CONFIG_PARSER_PROTOTYPE(config_parse_user_group_compat); +CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat); +CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces); +CONFIG_PARSER_PROTOTYPE(config_parse_restrict_filesystems); +CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc); +CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset); +CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec); +CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec); +CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields); +CONFIG_PARSER_PROTOTYPE(config_parse_log_namespace); +CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_pid_file); +CONFIG_PARSER_PROTOTYPE(config_parse_exit_status); +CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers); +CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask); +CONFIG_PARSER_PROTOTYPE(config_parse_ip_filter_bpf_progs); +CONFIG_PARSER_PROTOTYPE(config_parse_cpu_affinity2); +CONFIG_PARSER_PROTOTYPE(config_parse_show_status); +CONFIG_PARSER_PROTOTYPE(config_parse_status_unit_format); +CONFIG_PARSER_PROTOTYPE(config_parse_output_restricted); +CONFIG_PARSER_PROTOTYPE(config_parse_crash_chvt); +CONFIG_PARSER_PROTOTYPE(config_parse_timeout_abort); +CONFIG_PARSER_PROTOTYPE(config_parse_swap_priority); +CONFIG_PARSER_PROTOTYPE(config_parse_mount_images); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_timestamping); +CONFIG_PARSER_PROTOTYPE(config_parse_extension_images); +CONFIG_PARSER_PROTOTYPE(config_parse_bpf_foreign_program); +CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_socket_bind); +CONFIG_PARSER_PROTOTYPE(config_parse_restrict_network_interfaces); +CONFIG_PARSER_PROTOTYPE(config_parse_watchdog_sec); +CONFIG_PARSER_PROTOTYPE(config_parse_tty_size); +CONFIG_PARSER_PROTOTYPE(config_parse_log_filter_patterns); +CONFIG_PARSER_PROTOTYPE(config_parse_open_file); +CONFIG_PARSER_PROTOTYPE(config_parse_memory_pressure_watch); +CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_nft_set); + +/* gperf prototypes */ +const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length); +extern const char load_fragment_gperf_nulstr[]; diff --git a/src/core/main.c b/src/core/main.c new file mode 100644 index 0000000..3f71cc0 --- /dev/null +++ b/src/core/main.c @@ -0,0 +1,3227 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#if HAVE_VALGRIND_VALGRIND_H +# include +#endif + +#include "sd-bus.h" +#include "sd-daemon.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "apparmor-setup.h" +#include "architecture.h" +#include "argv-util.h" +#if HAVE_LIBBPF +#include "bpf-lsm.h" +#endif +#include "build.h" +#include "bus-error.h" +#include "bus-util.h" +#include "capability-util.h" +#include "cgroup-util.h" +#include "chase.h" +#include "clock-util.h" +#include "conf-parser.h" +#include "confidential-virt.h" +#include "copy.h" +#include "cpu-set-util.h" +#include "crash-handler.h" +#include "dbus-manager.h" +#include "dbus.h" +#include "constants.h" +#include "dev-setup.h" +#include "efi-random.h" +#include "efivars.h" +#include "emergency-action.h" +#include "env-util.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fdset.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "getopt-defs.h" +#include "hexdecoct.h" +#include "hostname-setup.h" +#include "ima-setup.h" +#include "import-creds.h" +#include "initrd-util.h" +#include "killall.h" +#include "kmod-setup.h" +#include "limits-util.h" +#include "load-fragment.h" +#include "log.h" +#include "loopback-setup.h" +#include "machine-id-setup.h" +#include "main.h" +#include "manager.h" +#include "manager-dump.h" +#include "manager-serialize.h" +#include "mkdir-label.h" +#include "mount-setup.h" +#include "os-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "psi-util.h" +#include "random-util.h" +#include "rlimit-util.h" +#include "seccomp-util.h" +#include "selinux-setup.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "smack-setup.h" +#include "special.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "switch-root.h" +#include "sysctl-util.h" +#include "terminal-util.h" +#include "time-util.h" +#include "umask-util.h" +#include "user-util.h" +#include "version.h" +#include "virt.h" +#include "watchdog.h" + +#if HAS_FEATURE_ADDRESS_SANITIZER +#include +#endif + +static enum { + ACTION_RUN, + ACTION_HELP, + ACTION_VERSION, + ACTION_TEST, + ACTION_DUMP_CONFIGURATION_ITEMS, + ACTION_DUMP_BUS_PROPERTIES, + ACTION_BUS_INTROSPECT, +} arg_action = ACTION_RUN; + +static const char *arg_bus_introspect = NULL; + +/* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access. Real + * defaults are assigned in reset_arguments() below. */ +static char *arg_default_unit; +static RuntimeScope arg_runtime_scope; +bool arg_dump_core; +int arg_crash_chvt; +bool arg_crash_shell; +bool arg_crash_reboot; +static char *arg_confirm_spawn; +static ShowStatus arg_show_status; +static StatusUnitFormat arg_status_unit_format; +static bool arg_switched_root; +static PagerFlags arg_pager_flags; +static bool arg_service_watchdogs; +static UnitDefaults arg_defaults; +static usec_t arg_runtime_watchdog; +static usec_t arg_reboot_watchdog; +static usec_t arg_kexec_watchdog; +static usec_t arg_pretimeout_watchdog; +static char *arg_early_core_pattern; +static char *arg_watchdog_pretimeout_governor; +static char *arg_watchdog_device; +static char **arg_default_environment; +static char **arg_manager_environment; +static uint64_t arg_capability_bounding_set; +static bool arg_no_new_privs; +static nsec_t arg_timer_slack_nsec; +static Set* arg_syscall_archs; +static FILE* arg_serialization; +static sd_id128_t arg_machine_id; +static EmergencyAction arg_cad_burst_action; +static CPUSet arg_cpu_affinity; +static NUMAPolicy arg_numa_policy; +static usec_t arg_clock_usec; +static void *arg_random_seed; +static size_t arg_random_seed_size; +static usec_t arg_reload_limit_interval_sec; +static unsigned arg_reload_limit_burst; + +/* A copy of the original environment block */ +static char **saved_env = NULL; + +static int parse_configuration(const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock); + +static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) { + _cleanup_free_ char *base = NULL; + _cleanup_strv_free_ char **files = NULL, **dirs = NULL; + int r; + + r = xdg_user_config_dir(&base, "/systemd"); + if (r < 0) + return r; + + r = strv_extendf(&files, "%s/user.conf", base); + if (r < 0) + return r; + + r = strv_extend(&files, PKGSYSCONFDIR "/user.conf"); + if (r < 0) + return r; + + r = strv_consume(&dirs, TAKE_PTR(base)); + if (r < 0) + return r; + + r = strv_extend_strv(&dirs, CONF_PATHS_STRV("systemd"), false); + if (r < 0) + return r; + + *ret_files = TAKE_PTR(files); + *ret_dirs = TAKE_PTR(dirs); + return 0; +} + +static int console_setup(void) { + _cleanup_close_ int tty_fd = -EBADF; + unsigned rows, cols; + int r; + + tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (tty_fd < 0) + return log_error_errno(tty_fd, "Failed to open /dev/console: %m"); + + /* We don't want to force text mode. plymouth may be showing + * pictures already from initrd. */ + r = reset_terminal_fd(tty_fd, false); + if (r < 0) + return log_error_errno(r, "Failed to reset /dev/console: %m"); + + r = proc_cmdline_tty_size("/dev/console", &rows, &cols); + if (r < 0) + log_warning_errno(r, "Failed to get terminal size, ignoring: %m"); + else { + r = terminal_set_size_fd(tty_fd, NULL, rows, cols); + if (r < 0) + log_warning_errno(r, "Failed to set terminal size, ignoring: %m"); + } + + return 0; +} + +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + int r; + + assert(key); + + if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value); + else if (in_initrd() == !!startswith(key, "rd.")) + return free_and_strdup_warn(&arg_default_unit, value); + + } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value); + else + arg_dump_core = r; + + } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (path_is_absolute(value)) + (void) parse_path_argument(value, false, &arg_early_core_pattern); + else + log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) { + + if (!value) + arg_crash_chvt = 0; /* turn on */ + else { + r = parse_crash_chvt(value, &arg_crash_chvt); + if (r < 0) + log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value); + } + + } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value); + else + arg_crash_shell = r; + + } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value); + else + arg_crash_reboot = r; + + } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) { + char *s; + + r = parse_confirm_spawn(value, &s); + if (r < 0) + log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value); + else + free_and_replace(arg_confirm_spawn, s); + + } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value); + else + arg_service_watchdogs = r; + + } else if (proc_cmdline_key_streq(key, "systemd.show_status")) { + + if (value) { + r = parse_show_status(value, &arg_show_status); + if (r < 0) + log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value); + } else + arg_show_status = SHOW_STATUS_YES; + + } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = status_unit_format_from_string(value); + if (r < 0) + log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value); + else + arg_status_unit_format = r; + + } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = exec_output_from_string(value); + if (r < 0) + log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value); + else + arg_defaults.std_output = r; + + } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = exec_output_from_string(value); + if (r < 0) + log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value); + else + arg_defaults.std_error = r; + + } else if (streq(key, "systemd.setenv")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (!env_assignment_is_valid(value)) + log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value); + else { + r = strv_env_replace_strdup(&arg_default_environment, value); + if (r < 0) + return log_oom(); + } + + } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = id128_from_string_nonzero(value, &arg_machine_id); + if (r < 0) + log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value); + + } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_sec(value, &arg_defaults.timeout_start_usec); + if (r < 0) + log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value); + + if (arg_defaults.timeout_start_usec <= 0) + arg_defaults.timeout_start_usec = USEC_INFINITY; + + } else if (proc_cmdline_key_streq(key, "systemd.default_device_timeout_sec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_sec(value, &arg_defaults.device_timeout_usec); + if (r < 0) + log_warning_errno(r, "Failed to parse default device timeout '%s', ignoring: %m", value); + + if (arg_defaults.device_timeout_usec <= 0) + arg_defaults.device_timeout_usec = USEC_INFINITY; + + } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_cpu_set(value, &arg_cpu_affinity); + if (r < 0) + log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value); + + } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + (void) parse_path_argument(value, false, &arg_watchdog_device); + + } else if (proc_cmdline_key_streq(key, "systemd.watchdog_sec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (streq(value, "default")) + arg_runtime_watchdog = USEC_INFINITY; + else if (streq(value, "off")) + arg_runtime_watchdog = 0; + else { + r = parse_sec(value, &arg_runtime_watchdog); + if (r < 0) { + log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value); + return 0; + } + } + + arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog; + + } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pre_sec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (streq(value, "default")) + arg_pretimeout_watchdog = USEC_INFINITY; + else if (streq(value, "off")) + arg_pretimeout_watchdog = 0; + else { + r = parse_sec(value, &arg_pretimeout_watchdog); + if (r < 0) { + log_warning_errno(r, "Failed to parse systemd.watchdog_pre_sec= argument '%s', ignoring: %m", value); + return 0; + } + } + + } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pretimeout_governor")) { + + if (proc_cmdline_value_missing(key, value) || isempty(value)) { + arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor); + return 0; + } + + if (!string_is_safe(value)) { + log_warning("Watchdog pretimeout governor '%s' is not valid, ignoring.", value); + return 0; + } + + return free_and_strdup_warn(&arg_watchdog_pretimeout_governor, value); + + } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = safe_atou64(value, &arg_clock_usec); + if (r < 0) + log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value); + + } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) { + void *p; + size_t sz; + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = unbase64mem(value, SIZE_MAX, &p, &sz); + if (r < 0) + log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value); + + free(arg_random_seed); + arg_random_seed = sz > 0 ? p : mfree(p); + arg_random_seed_size = sz; + + } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_interval_sec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_sec(value, &arg_reload_limit_interval_sec); + if (r < 0) { + log_warning_errno(r, "Failed to parse systemd.reload_limit_interval_sec= argument '%s', ignoring: %m", value); + return 0; + } + + } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_burst")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = safe_atou(value, &arg_reload_limit_burst); + if (r < 0) { + log_warning_errno(r, "Failed to parse systemd.reload_limit_burst= argument '%s', ignoring: %m", value); + return 0; + } + + } else if (streq(key, "quiet") && !value) { + + if (arg_show_status == _SHOW_STATUS_INVALID) + arg_show_status = SHOW_STATUS_ERROR; + + } else if (streq(key, "debug") && !value) { + + /* Note that log_parse_environment() handles 'debug' + * too, and sets the log level to LOG_DEBUG. */ + + if (detect_container() > 0) + log_set_target(LOG_TARGET_CONSOLE); + + } else if (!value) { + const char *target; + + /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */ + target = runlevel_to_target(key); + if (target) + return free_and_strdup_warn(&arg_default_unit, target); + } + + return 0; +} + +#define DEFINE_SETTER(name, func, descr) \ + static int name(const char *unit, \ + const char *filename, \ + unsigned line, \ + const char *section, \ + unsigned section_line, \ + const char *lvalue, \ + int ltype, \ + const char *rvalue, \ + void *data, \ + void *userdata) { \ + \ + int r; \ + \ + assert(filename); \ + assert(lvalue); \ + assert(rvalue); \ + \ + r = func(rvalue); \ + if (r < 0) \ + log_syntax(unit, LOG_ERR, filename, line, r, \ + "Invalid " descr "'%s': %m", \ + rvalue); \ + \ + return 0; \ + } + +DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level"); +DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target"); +DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color"); +DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location"); +DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time"); + +static int config_parse_default_timeout_abort( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + int r; + + r = config_parse_timeout_abort( + unit, + filename, + line, + section, + section_line, + lvalue, + ltype, + rvalue, + &arg_defaults.timeout_abort_usec, + userdata); + if (r >= 0) + arg_defaults.timeout_abort_set = r; + return 0; +} + +static int config_parse_oom_score_adjust( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int oa, r; + + if (isempty(rvalue)) { + arg_defaults.oom_score_adjust_set = false; + return 0; + } + + r = parse_oom_score_adjust(rvalue, &oa); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue); + return 0; + } + + arg_defaults.oom_score_adjust = oa; + arg_defaults.oom_score_adjust_set = true; + + return 0; +} + +static int parse_config_file(void) { + const ConfigTableItem items[] = { + { "Manager", "LogLevel", config_parse_level2, 0, NULL }, + { "Manager", "LogTarget", config_parse_target, 0, NULL }, + { "Manager", "LogColor", config_parse_color, 0, NULL }, + { "Manager", "LogLocation", config_parse_location, 0, NULL }, + { "Manager", "LogTime", config_parse_time, 0, NULL }, + { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core }, + { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt }, + { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt }, + { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell }, + { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot }, + { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status }, + { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format }, + { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity }, + { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type }, + { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy }, + { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL }, + { "Manager", "RuntimeWatchdogSec", config_parse_watchdog_sec, 0, &arg_runtime_watchdog }, + { "Manager", "RuntimeWatchdogPreSec", config_parse_watchdog_sec, 0, &arg_pretimeout_watchdog }, + { "Manager", "RebootWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog }, + { "Manager", "ShutdownWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */ + { "Manager", "KExecWatchdogSec", config_parse_watchdog_sec, 0, &arg_kexec_watchdog }, + { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device }, + { "Manager", "RuntimeWatchdogPreGovernor", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor }, + { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set }, + { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs }, +#if HAVE_SECCOMP + { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs }, +#else + { "Manager", "SystemCallArchitectures", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL }, + +#endif + { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec }, + { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_defaults.timer_accuracy_usec }, + { "Manager", "DefaultStandardOutput", config_parse_output_restricted, 0, &arg_defaults.std_output }, + { "Manager", "DefaultStandardError", config_parse_output_restricted, 0, &arg_defaults.std_error }, + { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_defaults.timeout_start_usec }, + { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_defaults.timeout_stop_usec }, + { "Manager", "DefaultTimeoutAbortSec", config_parse_default_timeout_abort, 0, NULL }, + { "Manager", "DefaultDeviceTimeoutSec", config_parse_sec, 0, &arg_defaults.device_timeout_usec }, + { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_defaults.restart_usec }, + { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_defaults.start_limit_interval}, /* obsolete alias */ + { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec, 0, &arg_defaults.start_limit_interval}, + { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_defaults.start_limit_burst }, + { "Manager", "DefaultEnvironment", config_parse_environ, arg_runtime_scope, &arg_default_environment }, + { "Manager", "ManagerEnvironment", config_parse_environ, arg_runtime_scope, &arg_manager_environment }, + { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_defaults.rlimit }, + { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_defaults.rlimit }, + { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_defaults.rlimit }, + { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_defaults.rlimit }, + { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_defaults.rlimit }, + { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_defaults.rlimit }, + { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_defaults.rlimit }, + { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_defaults.rlimit }, + { "Manager", "DefaultLimitNPROC", config_parse_rlimit, RLIMIT_NPROC, arg_defaults.rlimit }, + { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_defaults.rlimit }, + { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_defaults.rlimit }, + { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_defaults.rlimit }, + { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_defaults.rlimit }, + { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_defaults.rlimit }, + { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_defaults.rlimit }, + { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_defaults.rlimit }, + { "Manager", "DefaultCPUAccounting", config_parse_bool, 0, &arg_defaults.cpu_accounting }, + { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_defaults.io_accounting }, + { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_defaults.ip_accounting }, + { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_defaults.blockio_accounting }, + { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_defaults.memory_accounting }, + { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_defaults.tasks_accounting }, + { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_defaults.tasks_max }, + { "Manager", "DefaultMemoryPressureThresholdSec", config_parse_sec, 0, &arg_defaults.memory_pressure_threshold_usec }, + { "Manager", "DefaultMemoryPressureWatch", config_parse_memory_pressure_watch, 0, &arg_defaults.memory_pressure_watch }, + { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, arg_runtime_scope, &arg_cad_burst_action }, + { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_defaults.oom_policy }, + { "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL }, + { "Manager", "ReloadLimitIntervalSec", config_parse_sec, 0, &arg_reload_limit_interval_sec }, + { "Manager", "ReloadLimitBurst", config_parse_unsigned, 0, &arg_reload_limit_burst }, +#if ENABLE_SMACK + { "Manager", "DefaultSmackProcessLabel", config_parse_string, 0, &arg_defaults.smack_process_label }, +#else + { "Manager", "DefaultSmackProcessLabel", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL }, +#endif + {} + }; + + if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) + (void) config_parse_config_file("system.conf", + "Manager\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, + NULL); + else { + _cleanup_strv_free_ char **files = NULL, **dirs = NULL; + int r; + + assert(arg_runtime_scope == RUNTIME_SCOPE_USER); + + r = manager_find_user_config_paths(&files, &dirs); + if (r < 0) + return log_error_errno(r, "Failed to determine config file paths: %m"); + + (void) config_parse_many( + (const char* const*) files, + (const char* const*) dirs, + "user.conf.d", + /* root = */ NULL, + "Manager\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, + NULL, NULL, NULL); + } + + /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we use + * USEC_INFINITY like everywhere else. */ + if (arg_defaults.timeout_start_usec <= 0) + arg_defaults.timeout_start_usec = USEC_INFINITY; + if (arg_defaults.timeout_stop_usec <= 0) + arg_defaults.timeout_stop_usec = USEC_INFINITY; + + return 0; +} + +static void set_manager_defaults(Manager *m) { + int r; + + assert(m); + + /* Propagates the various default unit property settings into the manager object, i.e. properties + * that do not affect the manager itself, but are just what newly allocated units will have set if + * they haven't set anything else. (Also see set_manager_settings() for the settings that affect the + * manager's own behaviour) */ + + r = manager_set_unit_defaults(m, &arg_defaults); + if (r < 0) + log_warning_errno(r, "Failed to set manager defaults, ignoring: %m"); + + r = manager_default_environment(m); + if (r < 0) + log_warning_errno(r, "Failed to set manager default environment, ignoring: %m"); + + r = manager_transient_environment_add(m, arg_default_environment); + if (r < 0) + log_warning_errno(r, "Failed to add to transient environment, ignoring: %m"); +} + +static void set_manager_settings(Manager *m) { + int r; + + assert(m); + + /* Propagates the various manager settings into the manager object, i.e. properties that + * effect the manager itself (as opposed to just being inherited into newly allocated + * units, see set_manager_defaults() above). */ + + m->confirm_spawn = arg_confirm_spawn; + m->service_watchdogs = arg_service_watchdogs; + m->cad_burst_action = arg_cad_burst_action; + /* Note that we don't do structured initialization here, otherwise it will reset the rate limit + * counter on every daemon-reload. */ + m->reload_ratelimit.interval = arg_reload_limit_interval_sec; + m->reload_ratelimit.burst = arg_reload_limit_burst; + + manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog); + manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog); + manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog); + manager_set_watchdog(m, WATCHDOG_PRETIMEOUT, arg_pretimeout_watchdog); + r = manager_set_watchdog_pretimeout_governor(m, arg_watchdog_pretimeout_governor); + if (r < 0) + log_warning_errno(r, "Failed to set watchdog pretimeout governor to '%s', ignoring: %m", arg_watchdog_pretimeout_governor); + + manager_set_show_status(m, arg_show_status, "command line"); + m->status_unit_format = arg_status_unit_format; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + COMMON_GETOPT_ARGS, + SYSTEMD_GETOPT_ARGS, + }; + + static const struct option options[] = { + COMMON_GETOPT_OPTIONS, + SYSTEMD_GETOPT_OPTIONS, + {} + }; + + int c, r; + bool user_arg_seen = false; + + assert(argc >= 1); + assert(argv); + + if (getpid_cached() == 1) + opterr = 0; + + while ((c = getopt_long(argc, argv, SYSTEMD_GETOPT_SHORT_OPTIONS, options, NULL)) >= 0) + + switch (c) { + + case ARG_LOG_LEVEL: + r = log_set_max_level_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg); + + break; + + case ARG_LOG_TARGET: + r = log_set_target_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg); + + break; + + case ARG_LOG_COLOR: + + if (optarg) { + r = log_show_color_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log color setting \"%s\": %m", + optarg); + } else + log_show_color(true); + + break; + + case ARG_LOG_LOCATION: + if (optarg) { + r = log_show_location_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log location setting \"%s\": %m", + optarg); + } else + log_show_location(true); + + break; + + case ARG_LOG_TIME: + + if (optarg) { + r = log_show_time_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log time setting \"%s\": %m", + optarg); + } else + log_show_time(true); + + break; + + case ARG_DEFAULT_STD_OUTPUT: + r = exec_output_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m", + optarg); + arg_defaults.std_output = r; + break; + + case ARG_DEFAULT_STD_ERROR: + r = exec_output_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m", + optarg); + arg_defaults.std_error = r; + break; + + case ARG_UNIT: + r = free_and_strdup(&arg_default_unit, optarg); + if (r < 0) + return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg); + + break; + + case ARG_SYSTEM: + arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; + break; + + case ARG_USER: + arg_runtime_scope = RUNTIME_SCOPE_USER; + user_arg_seen = true; + break; + + case ARG_TEST: + arg_action = ACTION_TEST; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_VERSION: + arg_action = ACTION_VERSION; + break; + + case ARG_DUMP_CONFIGURATION_ITEMS: + arg_action = ACTION_DUMP_CONFIGURATION_ITEMS; + break; + + case ARG_DUMP_BUS_PROPERTIES: + arg_action = ACTION_DUMP_BUS_PROPERTIES; + break; + + case ARG_BUS_INTROSPECT: + arg_bus_introspect = optarg; + arg_action = ACTION_BUS_INTROSPECT; + break; + + case ARG_DUMP_CORE: + r = parse_boolean_argument("--dump-core", optarg, &arg_dump_core); + if (r < 0) + return r; + break; + + case ARG_CRASH_CHVT: + r = parse_crash_chvt(optarg, &arg_crash_chvt); + if (r < 0) + return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m", + optarg); + break; + + case ARG_CRASH_SHELL: + r = parse_boolean_argument("--crash-shell", optarg, &arg_crash_shell); + if (r < 0) + return r; + break; + + case ARG_CRASH_REBOOT: + r = parse_boolean_argument("--crash-reboot", optarg, &arg_crash_reboot); + if (r < 0) + return r; + break; + + case ARG_CONFIRM_SPAWN: + arg_confirm_spawn = mfree(arg_confirm_spawn); + + r = parse_confirm_spawn(optarg, &arg_confirm_spawn); + if (r < 0) + return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m", + optarg); + break; + + case ARG_SERVICE_WATCHDOGS: + r = parse_boolean_argument("--service-watchdogs=", optarg, &arg_service_watchdogs); + if (r < 0) + return r; + break; + + case ARG_SHOW_STATUS: + if (optarg) { + r = parse_show_status(optarg, &arg_show_status); + if (r < 0) + return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m", + optarg); + } else + arg_show_status = SHOW_STATUS_YES; + break; + + case ARG_DESERIALIZE: { + int fd; + FILE *f; + + fd = parse_fd(optarg); + if (fd < 0) + return log_error_errno(fd, "Failed to parse serialization fd \"%s\": %m", optarg); + + (void) fd_cloexec(fd, true); + + f = fdopen(fd, "r"); + if (!f) + return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd); + + safe_fclose(arg_serialization); + arg_serialization = f; + + break; + } + + case ARG_SWITCHED_ROOT: + arg_switched_root = true; + break; + + case ARG_MACHINE_ID: + r = id128_from_string_nonzero(optarg, &arg_machine_id); + if (r < 0) + return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg); + break; + + case 'h': + arg_action = ACTION_HELP; + break; + + case 'D': + log_set_max_level(LOG_DEBUG); + break; + + case 'b': + case 's': + case 'z': + /* Just to eat away the sysvinit kernel cmdline args that we'll parse in + * parse_proc_cmdline_item() or ignore, without any getopt() error messages. + */ + case '?': + if (getpid_cached() != 1) + return -EINVAL; + else + return 0; + + default: + assert_not_reached(); + } + + if (optind < argc && getpid_cached() != 1) + /* Hmm, when we aren't run as init system let's complain about excess arguments */ + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments."); + + if (arg_action == ACTION_RUN && arg_runtime_scope == RUNTIME_SCOPE_USER && !user_arg_seen) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Explicit --user argument required to run as user manager."); + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "%sStarts and monitors system and user services.%s\n\n" + "This program takes no positional arguments.\n\n" + "%sOptions%s:\n" + " -h --help Show this help\n" + " --version Show version\n" + " --test Determine initial transaction, dump it and exit\n" + " --system Combined with --test: operate in system mode\n" + " --user Combined with --test: operate in user mode\n" + " --dump-configuration-items Dump understood unit configuration items\n" + " --dump-bus-properties Dump exposed bus properties\n" + " --bus-introspect=PATH Write XML introspection data\n" + " --unit=UNIT Set default unit\n" + " --dump-core[=BOOL] Dump core on crash\n" + " --crash-vt=NR Change to specified VT on crash\n" + " --crash-reboot[=BOOL] Reboot on crash\n" + " --crash-shell[=BOOL] Run shell on crash\n" + " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n" + " --show-status[=BOOL] Show status updates on the console during boot\n" + " --log-target=TARGET Set log target (console, journal, kmsg,\n" + " journal-or-kmsg, null)\n" + " --log-level=LEVEL Set log level (debug, info, notice, warning,\n" + " err, crit, alert, emerg)\n" + " --log-color[=BOOL] Highlight important log messages\n" + " --log-location[=BOOL] Include code location in log messages\n" + " --log-time[=BOOL] Prefix log messages with current time\n" + " --default-standard-output= Set default standard output for services\n" + " --default-standard-error= Set default standard error output for services\n" + " --no-pager Do not pipe output into a pager\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + ansi_underline(), + ansi_normal(), + link); + + return 0; +} + +static int prepare_reexecute( + Manager *m, + FILE **ret_f, + FDSet **ret_fds, + bool switching_root) { + + _cleanup_fdset_free_ FDSet *fds = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(m); + assert(ret_f); + assert(ret_fds); + + r = manager_open_serialization(m, &f); + if (r < 0) + return log_error_errno(r, "Failed to create serialization file: %m"); + + /* Make sure nothing is really destructed when we shut down */ + m->n_reloading++; + bus_manager_send_reloading(m, true); + + fds = fdset_new(); + if (!fds) + return log_oom(); + + r = manager_serialize(m, f, fds, switching_root); + if (r < 0) + return r; + + if (fseeko(f, 0, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to rewind serialization fd: %m"); + + r = fd_cloexec(fileno(f), false); + if (r < 0) + return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m"); + + r = fdset_cloexec(fds, false); + if (r < 0) + return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m"); + + *ret_f = TAKE_PTR(f); + *ret_fds = TAKE_PTR(fds); + + return 0; +} + +static void bump_file_max_and_nr_open(void) { + + /* Let's bump fs.file-max and fs.nr_open to their respective maximums. On current kernels large + * numbers of file descriptors are no longer a performance problem and their memory is properly + * tracked by memcg, thus counting them and limiting them in another two layers of limits is + * unnecessary and just complicates things. This function hence turns off 2 of the 4 levels of limits + * on file descriptors, and makes RLIMIT_NOLIMIT (soft + hard) the only ones that really matter. */ + +#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN + int r; +#endif + +#if BUMP_PROC_SYS_FS_FILE_MAX + /* The maximum the kernel allows for this since 5.2 is LONG_MAX, use that. (Previously things were + * different, but the operation would fail silently.) */ + r = sysctl_write("fs/file-max", LONG_MAX_STR); + if (r < 0) + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, + r, "Failed to bump fs.file-max, ignoring: %m"); +#endif + +#if BUMP_PROC_SYS_FS_NR_OPEN + int v = INT_MAX; + + /* Argh! The kernel enforces maximum and minimum values on the fs.nr_open, but we don't really know + * what they are. The expression by which the maximum is determined is dependent on the architecture, + * and is something we don't really want to copy to userspace, as it is dependent on implementation + * details of the kernel. Since the kernel doesn't expose the maximum value to us, we can only try + * and hope. Hence, let's start with INT_MAX, and then keep halving the value until we find one that + * works. Ugly? Yes, absolutely, but kernel APIs are kernel APIs, so what do can we do... 🤯 */ + + for (;;) { + int k; + + v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */ + if (v < 1024) { + log_warning("Can't bump fs.nr_open, value too small."); + break; + } + + k = read_nr_open(); + if (k < 0) { + log_error_errno(k, "Failed to read fs.nr_open: %m"); + break; + } + if (k >= v) { /* Already larger */ + log_debug("Skipping bump, value is already larger."); + break; + } + + r = sysctl_writef("fs/nr_open", "%i", v); + if (r == -EINVAL) { + log_debug("Couldn't write fs.nr_open as %i, halving it.", v); + v /= 2; + continue; + } + if (r < 0) { + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m"); + break; + } + + log_debug("Successfully bumped fs.nr_open to %i", v); + break; + } +#endif +} + +static int bump_rlimit_nofile(const struct rlimit *saved_rlimit) { + struct rlimit new_rlimit; + int r, nr; + + /* Get the underlying absolute limit the kernel enforces */ + nr = read_nr_open(); + + /* Calculate the new limits to use for us. Never lower from what we inherited. */ + new_rlimit = (struct rlimit) { + .rlim_cur = MAX((rlim_t) nr, saved_rlimit->rlim_cur), + .rlim_max = MAX((rlim_t) nr, saved_rlimit->rlim_max), + }; + + /* Shortcut if nothing changes. */ + if (saved_rlimit->rlim_max >= new_rlimit.rlim_max && + saved_rlimit->rlim_cur >= new_rlimit.rlim_cur) { + log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping."); + return 0; + } + + /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows, for + * both hard and soft. */ + r = setrlimit_closest(RLIMIT_NOFILE, &new_rlimit); + if (r < 0) + return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m"); + + return 0; +} + +static int bump_rlimit_memlock(const struct rlimit *saved_rlimit) { + struct rlimit new_rlimit; + uint64_t mm; + int r; + + /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK + * which should normally disable such checks. We need them to implement IPAddressAllow= and + * IPAddressDeny=, hence let's bump the value high enough for our user. */ + + /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t + * must be unsigned, hence this is a given, but let's make this clear here. */ + assert_cc(RLIM_INFINITY > 0); + + mm = physical_memory_scale(1, 8); /* Let's scale how much we allow to be locked by the amount of + * physical RAM. We allow an eighth to be locked by us, just to + * pick a value. */ + + new_rlimit = (struct rlimit) { + .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm), + .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm), + }; + + if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur && + saved_rlimit->rlim_cur >= new_rlimit.rlim_max) { + log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping."); + return 0; + } + + r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit); + if (r < 0) + return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m"); + + return 0; +} + +static void test_usr(void) { + + /* Check that /usr is either on the same file system as / or mounted already. */ + + if (dir_is_empty("/usr", /* ignore_hidden_or_backup= */ false) <= 0) + return; + + log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. " + "Some things will probably break (sometimes even silently) in mysterious ways. " + "Consult https://www.freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information."); +} + +static int enforce_syscall_archs(Set *archs) { +#if HAVE_SECCOMP + int r; + + if (!is_seccomp_available()) + return 0; + + r = seccomp_restrict_archs(arg_syscall_archs); + if (r < 0) + return log_error_errno(r, "Failed to enforce system call architecture restrication: %m"); +#endif + return 0; +} + +static int os_release_status(void) { + _cleanup_free_ char *pretty_name = NULL, *name = NULL, *version = NULL, + *ansi_color = NULL, *support_end = NULL; + int r; + + r = parse_os_release(NULL, + "PRETTY_NAME", &pretty_name, + "NAME", &name, + "VERSION", &version, + "ANSI_COLOR", &ansi_color, + "SUPPORT_END", &support_end); + if (r < 0) + return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to read os-release file, ignoring: %m"); + + const char *label = os_release_pretty_name(pretty_name, name); + + if (show_status_on(arg_show_status)) { + if (log_get_show_color()) + status_printf(NULL, 0, + "\nWelcome to \x1B[%sm%s\x1B[0m!\n", + empty_to_null(ansi_color) ?: "1", + label); + else + status_printf(NULL, 0, + "\nWelcome to %s!\n", + label); + } + + if (support_end && os_release_support_ended(support_end, /* quiet */ false, NULL) > 0) + /* pretty_name may include the version already, so we'll print the version only if we + * have it and we're not using pretty_name. */ + status_printf(ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL, 0, + "This OS version (%s%s%s) is past its end-of-support date (%s)", + label, + (pretty_name || !version) ? "" : " version ", + (pretty_name || !version) ? "" : version, + support_end); + + return 0; +} + +static int setup_os_release(RuntimeScope scope) { + _cleanup_free_ char *os_release_dst = NULL; + const char *os_release_src = "/etc/os-release"; + int r; + + if (access("/etc/os-release", F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check if /etc/os-release exists, ignoring: %m"); + + os_release_src = "/usr/lib/os-release"; + } + + if (scope == RUNTIME_SCOPE_SYSTEM) { + os_release_dst = strdup("/run/systemd/propagate/.os-release-stage/os-release"); + if (!os_release_dst) + return log_oom_debug(); + } else { + if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid()) < 0) + return log_oom_debug(); + } + + r = mkdir_parents_label(os_release_dst, 0755); + if (r < 0) + return log_debug_errno(r, "Failed to create parent directory of %s, ignoring: %m", os_release_dst); + + r = copy_file_atomic(os_release_src, os_release_dst, 0644, COPY_MAC_CREATE|COPY_REPLACE); + if (r < 0) + return log_debug_errno(r, "Failed to create %s, ignoring: %m", os_release_dst); + + return 0; +} + +static int write_container_id(void) { + const char *c; + int r = 0; /* avoid false maybe-uninitialized warning */ + + c = getenv("container"); + if (isempty(c)) + return 0; + + WITH_UMASK(0022) + r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE); + if (r < 0) + return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m"); + + return 1; +} + +static int bump_unix_max_dgram_qlen(void) { + _cleanup_free_ char *qlen = NULL; + unsigned long v; + int r; + + /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set + * the value really really early during boot, so that it is actually applied to all our sockets, + * including the $NOTIFY_SOCKET one. */ + + r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen); + if (r < 0) + return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to read AF_UNIX datagram queue length, ignoring: %m"); + + r = safe_atolu(qlen, &v); + if (r < 0) + return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen); + + if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN) + return 0; + + r = sysctl_write("net/unix/max_dgram_qlen", STRINGIFY(DEFAULT_UNIX_MAX_DGRAM_QLEN)); + if (r < 0) + return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to bump AF_UNIX datagram queue length, ignoring: %m"); + + return 1; +} + +static int fixup_environment(void) { + _cleanup_free_ char *term = NULL; + const char *t; + int r; + + /* Only fix up the environment when we are started as PID 1 */ + if (getpid_cached() != 1) + return 0; + + /* We expect the environment to be set correctly if run inside a container. */ + if (detect_container() > 0) + return 0; + + /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the + * backend device used by the console. We try to make a better guess here since some consoles might + * not have support for color mode for example. + * + * However if TERM was configured through the kernel command line then leave it alone. */ + r = proc_cmdline_get_key("TERM", 0, &term); + if (r < 0) + return r; + + if (r == 0) { + r = proc_cmdline_get_key("systemd.tty.term.console", 0, &term); + if (r < 0) + return r; + } + + t = term ?: default_term_for_tty("/dev/console"); + + if (setenv("TERM", t, 1) < 0) + return -errno; + + /* The kernels sets HOME=/ for init. Let's undo this. */ + if (path_equal_ptr(getenv("HOME"), "/")) + assert_se(unsetenv("HOME") == 0); + + return 0; +} + +static void redirect_telinit(int argc, char *argv[]) { + + /* This is compatibility support for SysV, where calling init as a user is identical to telinit. */ + +#if HAVE_SYSV_COMPAT + if (getpid_cached() == 1) + return; + + if (!invoked_as(argv, "init")) + return; + + execv(SYSTEMCTL_BINARY_PATH, argv); + log_error_errno(errno, "Failed to exec " SYSTEMCTL_BINARY_PATH ": %m"); + exit(EXIT_FAILURE); +#endif +} + +static int become_shutdown(int objective, int retval) { + static const char* const table[_MANAGER_OBJECTIVE_MAX] = { + [MANAGER_EXIT] = "exit", + [MANAGER_REBOOT] = "reboot", + [MANAGER_POWEROFF] = "poweroff", + [MANAGER_HALT] = "halt", + [MANAGER_KEXEC] = "kexec", + }; + + char log_level[STRLEN("--log-level=") + DECIMAL_STR_MAX(int)], + timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")], + exit_code[STRLEN("--exit-code=") + DECIMAL_STR_MAX(uint8_t)]; + + _cleanup_strv_free_ char **env_block = NULL; + usec_t watchdog_timer = 0; + int r; + + assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX); + assert(table[objective]); + + xsprintf(log_level, "--log-level=%d", log_get_max_level()); + xsprintf(timeout, "--timeout=%" PRI_USEC "us", arg_defaults.timeout_stop_usec); + + const char* command_line[10] = { + SYSTEMD_SHUTDOWN_BINARY_PATH, + table[objective], + log_level, + timeout, + /* Note that the last position is a terminator and must contain NULL. */ + }; + size_t pos = 4; + + assert(command_line[pos-1]); + assert(!command_line[pos]); + + switch (log_get_target()) { + + case LOG_TARGET_KMSG: + case LOG_TARGET_JOURNAL_OR_KMSG: + case LOG_TARGET_SYSLOG_OR_KMSG: + command_line[pos++] = "--log-target=kmsg"; + break; + + case LOG_TARGET_NULL: + command_line[pos++] = "--log-target=null"; + break; + + case LOG_TARGET_CONSOLE: + default: + command_line[pos++] = "--log-target=console"; + break; + }; + + if (log_get_show_color()) + command_line[pos++] = "--log-color"; + + if (log_get_show_location()) + command_line[pos++] = "--log-location"; + + if (log_get_show_time()) + command_line[pos++] = "--log-time"; + + xsprintf(exit_code, "--exit-code=%d", retval); + command_line[pos++] = exit_code; + + assert(pos < ELEMENTSOF(command_line)); + + /* The watchdog: */ + + if (objective == MANAGER_REBOOT) + watchdog_timer = arg_reboot_watchdog; + else if (objective == MANAGER_KEXEC) + watchdog_timer = arg_kexec_watchdog; + + /* If we reboot or kexec let's set the shutdown watchdog and tell the + * shutdown binary to repeatedly ping it. + * Disable the pretimeout watchdog, as we do not support it from the shutdown binary. */ + (void) watchdog_setup_pretimeout(0); + (void) watchdog_setup_pretimeout_governor(NULL); + r = watchdog_setup(watchdog_timer); + watchdog_close(r < 0); + + /* The environment block: */ + + env_block = strv_copy(environ); + + /* Tell the binary how often to ping, ignore failure */ + (void) strv_extendf(&env_block, "WATCHDOG_USEC="USEC_FMT, watchdog_timer); + + if (arg_watchdog_device) + (void) strv_extendf(&env_block, "WATCHDOG_DEVICE=%s", arg_watchdog_device); + + /* Avoid the creation of new processes forked by the kernel; at this + * point, we will not listen to the signals anyway */ + if (detect_container() <= 0) + (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER); + + execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block); + return -errno; +} + +static void initialize_clock(void) { + int r; + + /* This is called very early on, before we parse the kernel command line or otherwise figure out why + * we are running, but only once. */ + + if (clock_is_localtime(NULL) > 0) { + int min; + + /* The very first call of settimeofday() also does a time warp in the kernel. + * + * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to + * take care of maintaining the RTC and do all adjustments. This matches the behavior of + * Windows, which leaves the RTC alone if the registry tells that the RTC runs in UTC. + */ + r = clock_set_timezone(&min); + if (r < 0) + log_error_errno(r, "Failed to apply local time delta, ignoring: %m"); + else + log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min); + + } else if (!in_initrd()) + /* + * Do a dummy very first call to seal the kernel's time warp magic. + * + * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with + * LOCAL, but the real system could be set up that way. In such case, we need to delay the + * time-warp or the sealing until we reach the real system. + * + * Do no set the kernel's timezone. The concept of local time cannot be supported reliably, + * the time will jump or be incorrect at every daylight saving time change. All kernel local + * time concepts will be treated as UTC that way. + */ + (void) clock_reset_timewarp(); + + ClockChangeDirection change_dir; + r = clock_apply_epoch(&change_dir); + if (r > 0 && change_dir == CLOCK_CHANGE_FORWARD) + log_info("System time before build time, advancing clock."); + else if (r > 0 && change_dir == CLOCK_CHANGE_BACKWARD) + log_info("System time is further ahead than %s after build time, resetting clock to build time.", + FORMAT_TIMESPAN(CLOCK_VALID_RANGE_USEC_MAX, USEC_PER_DAY)); + else if (r < 0 && change_dir == CLOCK_CHANGE_FORWARD) + log_error_errno(r, "Current system time is before build time, but cannot correct: %m"); + else if (r < 0 && change_dir == CLOCK_CHANGE_BACKWARD) + log_error_errno(r, "Current system time is further ahead %s after build time, but cannot correct: %m", + FORMAT_TIMESPAN(CLOCK_VALID_RANGE_USEC_MAX, USEC_PER_DAY)); +} + +static void apply_clock_update(void) { + /* This is called later than initialize_clock(), i.e. after we parsed configuration files/kernel + * command line and such. */ + + if (arg_clock_usec == 0) + return; + + if (getpid_cached() != 1) + return; + + if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(arg_clock_usec)) < 0) + log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m"); + else + log_info("Set system clock to %s, as specified on the kernel command line.", + FORMAT_TIMESTAMP(arg_clock_usec)); +} + +static void cmdline_take_random_seed(void) { + size_t suggested; + int r; + + if (arg_random_seed_size == 0) + return; + + if (getpid_cached() != 1) + return; + + assert(arg_random_seed); + suggested = random_pool_size(); + + if (arg_random_seed_size < suggested) + log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.", + arg_random_seed_size, suggested); + + r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true); + if (r < 0) { + log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m"); + return; + } + + log_notice("Successfully credited entropy passed on kernel command line.\n" + "Note that the seed provided this way is accessible to unprivileged programs. " + "This functionality should not be used outside of testing environments."); +} + +static void initialize_coredump(bool skip_setup) { + if (getpid_cached() != 1) + return; + + /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which honour + * the limit) will process core dumps for system services by default. */ + if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0) + log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m"); + + /* But at the same time, turn off the core_pattern logic by default, so that no coredumps are stored + * until the systemd-coredump tool is enabled via sysctl. However it can be changed via the kernel + * command line later so core dumps can still be generated during early startup and in initrd. */ + if (!skip_setup) + disable_coredumps(); +} + +static void initialize_core_pattern(bool skip_setup) { + int r; + + if (skip_setup || !arg_early_core_pattern) + return; + + if (getpid_cached() != 1) + return; + + r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m", + arg_early_core_pattern); +} + +static void update_cpu_affinity(bool skip_setup) { + _cleanup_free_ char *mask = NULL; + + if (skip_setup || !arg_cpu_affinity.set) + return; + + assert(arg_cpu_affinity.allocated > 0); + + mask = cpu_set_to_range_string(&arg_cpu_affinity); + log_debug("Setting CPU affinity to {%s}.", strnull(mask)); + + if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0) + log_warning_errno(errno, "Failed to set CPU affinity, ignoring: %m"); +} + +static void update_numa_policy(bool skip_setup) { + int r; + _cleanup_free_ char *nodes = NULL; + const char * policy = NULL; + + if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy))) + return; + + if (DEBUG_LOGGING) { + policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy)); + nodes = cpu_set_to_range_string(&arg_numa_policy.nodes); + log_debug("Setting NUMA policy to %s, with nodes {%s}.", strnull(policy), strnull(nodes)); + } + + r = apply_numa_policy(&arg_numa_policy); + if (r == -EOPNOTSUPP) + log_debug_errno(r, "NUMA support not available, ignoring."); + else if (r < 0) + log_warning_errno(r, "Failed to set NUMA memory policy, ignoring: %m"); +} + +static void filter_args( + const char* dst[], + size_t *dst_index, + char **src, + int argc) { + + assert(dst); + assert(dst_index); + + /* Copy some filtered arguments into the dst array from src. */ + for (int i = 1; i < argc; i++) { + if (STR_IN_SET(src[i], + "--switched-root", + "--system", + "--user")) + continue; + + if (startswith(src[i], "--deserialize=")) + continue; + if (streq(src[i], "--deserialize")) { + i++; /* Skip the argument too */ + continue; + } + + /* Skip target unit designators. We already acted upon this information and have queued + * appropriate jobs. We don't want to redo all this after reexecution. */ + if (startswith(src[i], "--unit=")) + continue; + if (streq(src[i], "--unit")) { + i++; /* Skip the argument too */ + continue; + } + + /* Seems we have a good old option. Let's pass it over to the new instance. */ + dst[(*dst_index)++] = src[i]; + } +} + +static void finish_remaining_processes(ManagerObjective objective) { + assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX); + + /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the + * SIGCHLD for them after deserializing. */ + if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT)) + broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec); + + /* On soft reboot really make sure nothing is left. Note that this will skip cgroups + * of units that were configured with SurviveFinalKillSignal=yes. */ + if (objective == MANAGER_SOFT_REBOOT) + broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec); +} + +static int do_reexecute( + ManagerObjective objective, + int argc, + char* argv[], + const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock, + FDSet *fds, + const char *switch_root_dir, + const char *switch_root_init, + const char **ret_error_message) { + + size_t i, args_size; + const char **args; + int r; + + assert(IN_SET(objective, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT)); + assert(argc >= 0); + assert(saved_rlimit_nofile); + assert(saved_rlimit_memlock); + assert(ret_error_message); + + if (switch_root_init) { + r = chase(switch_root_init, switch_root_dir, CHASE_PREFIX_ROOT, NULL, NULL); + if (r < 0) + log_warning_errno(r, "Failed to chase configured init %s/%s: %m", + strempty(switch_root_dir), switch_root_init); + } else { + r = chase(SYSTEMD_BINARY_PATH, switch_root_dir, CHASE_PREFIX_ROOT, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed to chase our own binary %s/%s: %m", + strempty(switch_root_dir), SYSTEMD_BINARY_PATH); + } + + if (r < 0) { + r = chase("/sbin/init", switch_root_dir, CHASE_PREFIX_ROOT, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to chase %s/sbin/init", strempty(switch_root_dir)); + } + + /* Close and disarm the watchdog, so that the new instance can reinitialize it, but doesn't get + * rebooted while we do that */ + watchdog_close(true); + + /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass + * the kernel default to its child processes */ + if (saved_rlimit_nofile->rlim_cur != 0) + (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile); + if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY) + (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock); + + finish_remaining_processes(objective); + + if (!switch_root_dir && objective == MANAGER_SOFT_REBOOT) { + /* If no switch root dir is specified, then check if /run/nextroot/ qualifies and use that */ + r = path_is_os_tree("/run/nextroot"); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to determine if /run/nextroot/ is a valid OS tree, ignoring: %m"); + else if (r > 0) + switch_root_dir = "/run/nextroot"; + } + + if (switch_root_dir) { + r = switch_root(/* new_root= */ switch_root_dir, + /* old_root_after= */ NULL, + /* flags= */ (objective == MANAGER_SWITCH_ROOT ? SWITCH_ROOT_DESTROY_OLD_ROOT : 0) | + (objective == MANAGER_SOFT_REBOOT ? 0 : SWITCH_ROOT_RECURSIVE_RUN)); + if (r < 0) + log_error_errno(r, "Failed to switch root, trying to continue: %m"); + } + + args_size = argc + 5; + args = newa(const char*, args_size); + + if (!switch_root_init) { + char sfd[STRLEN("--deserialize=") + DECIMAL_STR_MAX(int)]; + + /* First try to spawn ourselves with the right path, and with full serialization. We do this + * only if the user didn't specify an explicit init to spawn. */ + + assert(arg_serialization); + assert(fds); + + xsprintf(sfd, "--deserialize=%i", fileno(arg_serialization)); + + i = 1; /* Leave args[0] empty for now. */ + + /* Put our stuff first to make sure it always gets parsed in case + * we get weird stuff from the kernel cmdline (like --) */ + if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT)) + args[i++] = "--switched-root"; + args[i++] = runtime_scope_cmdline_option_to_string(arg_runtime_scope); + args[i++] = sfd; + + filter_args(args, &i, argv, argc); + + args[i++] = NULL; + + assert(i <= args_size); + + /* + * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do + * this is on its own on exec(), but it will do it on exit(). Hence, to ensure we get a + * summary here, fork() off a child, let it exit() cleanly, so that it prints the summary, + * and wait() for it in the parent, before proceeding into the exec(). + */ + valgrind_summary_hack(); + + args[0] = SYSTEMD_BINARY_PATH; + (void) execv(args[0], (char* const*) args); + + if (objective == MANAGER_REEXECUTE) { + *ret_error_message = "Failed to execute our own binary"; + return log_error_errno(errno, "Failed to execute our own binary %s: %m", args[0]); + } + + log_debug_errno(errno, "Failed to execute our own binary %s, trying fallback: %m", args[0]); + } + + /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and + * envp[]. (Well, modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[], + * but let's hope that doesn't matter.) */ + + arg_serialization = safe_fclose(arg_serialization); + fds = fdset_free(fds); + + /* Reopen the console */ + (void) make_console_stdio(); + + i = 1; /* Leave args[0] empty for now. */ + for (int j = 1; j <= argc; j++) + args[i++] = argv[j]; + assert(i <= args_size); + + /* Re-enable any blocked signals, especially important if we switch from initrd to init=... */ + (void) reset_all_signal_handlers(); + (void) reset_signal_mask(); + (void) rlimit_nofile_safe(); + + if (switch_root_init) { + args[0] = switch_root_init; + (void) execve(args[0], (char* const*) args, saved_env); + log_warning_errno(errno, "Failed to execute configured init %s, trying fallback: %m", args[0]); + } + + args[0] = "/sbin/init"; + (void) execv(args[0], (char* const*) args); + r = -errno; + + manager_status_printf(NULL, STATUS_TYPE_EMERGENCY, + ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL, + "Failed to execute /sbin/init"); + + *ret_error_message = "Failed to execute fallback shell"; + if (r == -ENOENT) { + log_warning("No /sbin/init, trying fallback"); + + args[0] = "/bin/sh"; + args[1] = NULL; + (void) execve(args[0], (char* const*) args, saved_env); + return log_error_errno(errno, "Failed to execute /bin/sh, giving up: %m"); + } else + return log_error_errno(r, "Failed to execute /sbin/init, giving up: %m"); +} + +static int invoke_main_loop( + Manager *m, + const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock, + int *ret_retval, /* Return parameters relevant for shutting down */ + FDSet **ret_fds, /* Return parameters for reexecuting */ + char **ret_switch_root_dir, /* … */ + char **ret_switch_root_init, /* … */ + const char **ret_error_message) { + + int r; + + assert(m); + assert(saved_rlimit_nofile); + assert(saved_rlimit_memlock); + assert(ret_retval); + assert(ret_fds); + assert(ret_switch_root_dir); + assert(ret_switch_root_init); + assert(ret_error_message); + + for (;;) { + int objective = manager_loop(m); + if (objective < 0) { + *ret_error_message = "Failed to run main loop"; + return log_struct_errno(LOG_EMERG, objective, + LOG_MESSAGE("Failed to run main loop: %m"), + "MESSAGE_ID=" SD_MESSAGE_CORE_MAINLOOP_FAILED_STR); + } + + switch (objective) { + + case MANAGER_RELOAD: { + LogTarget saved_log_target; + int saved_log_level; + + manager_send_reloading(m); + + log_info("Reloading..."); + + /* First, save any overridden log level/target, then parse the configuration file, + * which might change the log level to new settings. */ + + saved_log_level = m->log_level_overridden ? log_get_max_level() : -1; + saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID; + + (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock); + + set_manager_defaults(m); + set_manager_settings(m); + + update_cpu_affinity(false); + update_numa_policy(false); + + if (saved_log_level >= 0) + manager_override_log_level(m, saved_log_level); + if (saved_log_target >= 0) + manager_override_log_target(m, saved_log_target); + + if (manager_reload(m) < 0) + /* Reloading failed before the point of no return. + * Let's continue running as if nothing happened. */ + m->objective = MANAGER_OK; + else + log_info("Reloading finished in " USEC_FMT " ms.", + usec_sub_unsigned(now(CLOCK_MONOTONIC), m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].monotonic) / USEC_PER_MSEC); + + continue; + } + + case MANAGER_REEXECUTE: + + manager_send_reloading(m); /* From the perspective of the manager calling us this is + * pretty much the same as a reload */ + + r = prepare_reexecute(m, &arg_serialization, ret_fds, false); + if (r < 0) { + *ret_error_message = "Failed to prepare for reexecution"; + return r; + } + + log_notice("Reexecuting."); + + *ret_retval = EXIT_SUCCESS; + *ret_switch_root_dir = *ret_switch_root_init = NULL; + + return objective; + + case MANAGER_SWITCH_ROOT: + + manager_send_reloading(m); /* From the perspective of the manager calling us this is + * pretty much the same as a reload */ + + manager_set_switching_root(m, true); + + if (!m->switch_root_init) { + r = prepare_reexecute(m, &arg_serialization, ret_fds, true); + if (r < 0) { + *ret_error_message = "Failed to prepare for reexecution"; + return r; + } + } else + *ret_fds = NULL; + + log_notice("Switching root."); + + *ret_retval = EXIT_SUCCESS; + + /* Steal the switch root parameters */ + *ret_switch_root_dir = TAKE_PTR(m->switch_root); + *ret_switch_root_init = TAKE_PTR(m->switch_root_init); + + return objective; + + case MANAGER_SOFT_REBOOT: + manager_send_reloading(m); + manager_set_switching_root(m, true); + + r = prepare_reexecute(m, &arg_serialization, ret_fds, /* switching_root= */ true); + if (r < 0) { + *ret_error_message = "Failed to prepare for reexecution"; + return r; + } + + log_notice("Soft-rebooting."); + + *ret_retval = EXIT_SUCCESS; + *ret_switch_root_dir = TAKE_PTR(m->switch_root); + *ret_switch_root_init = NULL; + + return objective; + + case MANAGER_EXIT: + if (MANAGER_IS_USER(m)) { + log_debug("Exit."); + + *ret_retval = m->return_value; + *ret_fds = NULL; + *ret_switch_root_dir = *ret_switch_root_init = NULL; + + return objective; + } + + _fallthrough_; + case MANAGER_REBOOT: + case MANAGER_POWEROFF: + case MANAGER_HALT: + case MANAGER_KEXEC: { + log_notice("Shutting down."); + + *ret_retval = m->return_value; + *ret_fds = NULL; + *ret_switch_root_dir = *ret_switch_root_init = NULL; + + return objective; + } + + default: + assert_not_reached(); + } + } +} + +static void log_execution_mode(bool *ret_first_boot) { + bool first_boot = false; + int r; + + assert(ret_first_boot); + + switch (arg_runtime_scope) { + + case RUNTIME_SCOPE_SYSTEM: { + struct utsname uts; + int v; + + log_info("systemd " GIT_VERSION " running in %ssystem mode (%s)", + arg_action == ACTION_TEST ? "test " : "", + systemd_features); + + v = detect_virtualization(); + if (v > 0) + log_info("Detected virtualization %s.", virtualization_to_string(v)); + + v = detect_confidential_virtualization(); + if (v > 0) + log_info("Detected confidential virtualization %s.", confidential_virtualization_to_string(v)); + + log_info("Detected architecture %s.", architecture_to_string(uname_architecture())); + + if (in_initrd()) + log_info("Running in initrd."); + else { + _cleanup_free_ char *id_text = NULL; + + /* Let's check whether we are in first boot. First, check if an override was + * specified on the kernel command line. If yes, we honour that. */ + + r = proc_cmdline_get_bool("systemd.condition-first-boot", /* flags = */ 0, &first_boot); + if (r < 0) + log_debug_errno(r, "Failed to parse systemd.condition-first-boot= kernel command line argument, ignoring: %m"); + + if (r > 0) + log_full(first_boot ? LOG_INFO : LOG_DEBUG, + "Kernel command line argument says we are %s first boot.", + first_boot ? "in" : "not in"); + else { + /* Second, perform autodetection. We use /etc/machine-id as flag file for + * this: If it is missing or contains the value "uninitialized", this is the + * first boot. In other cases, it is not. This allows container managers and + * installers to provision a couple of files in /etc but still permit the + * first-boot initialization to occur. If the container manager wants to + * provision the machine ID it should pass $container_uuid to PID 1. */ + + r = read_one_line_file("/etc/machine-id", &id_text); + if (r < 0 || streq(id_text, "uninitialized")) { + if (r < 0 && r != -ENOENT) + log_warning_errno(r, "Unexpected error while reading /etc/machine-id, assuming first boot: %m"); + + first_boot = true; + log_info("Detected first boot."); + } else + log_debug("Detected initialized system, this is not the first boot."); + } + } + + assert_se(uname(&uts) >= 0); + + if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0) + log_warning("Warning! Reported kernel version %s is older than systemd's required baseline kernel version %s. " + "Your mileage may vary.", uts.release, KERNEL_BASELINE_VERSION); + else + log_debug("Kernel version %s, our baseline is %s", uts.release, KERNEL_BASELINE_VERSION); + + break; + } + + case RUNTIME_SCOPE_USER: + if (DEBUG_LOGGING) { + _cleanup_free_ char *t = NULL; + + t = uid_to_name(getuid()); + log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)", + arg_action == ACTION_TEST ? " test" : "", + getuid(), strna(t), systemd_features); + } + + break; + + default: + assert_not_reached(); + } + + *ret_first_boot = first_boot; +} + +static int initialize_runtime( + bool skip_setup, + bool first_boot, + struct rlimit *saved_rlimit_nofile, + struct rlimit *saved_rlimit_memlock, + const char **ret_error_message) { + int r; + + assert(ret_error_message); + + /* Sets up various runtime parameters. Many of these initializations are conditionalized: + * + * - Some only apply to --system instances + * - Some only apply to --user instances + * - Some only apply when we first start up, but not when we reexecute + */ + + if (arg_action != ACTION_RUN) + return 0; + + update_cpu_affinity(skip_setup); + update_numa_policy(skip_setup); + + switch (arg_runtime_scope) { + + case RUNTIME_SCOPE_SYSTEM: + /* Make sure we leave a core dump without panicking the kernel. */ + install_crash_handler(); + + if (!skip_setup) { + r = mount_cgroup_controllers(); + if (r < 0) { + *ret_error_message = "Failed to mount cgroup hierarchies"; + return r; + } + + /* Pull credentials from various sources into a common credential directory (we do + * this here, before setting up the machine ID, so that we can use credential info + * for setting up the machine ID) */ + (void) import_credentials(); + + (void) os_release_status(); + (void) hostname_setup(true); + /* Force transient machine-id on first boot. */ + machine_id_setup(/* root= */ NULL, /* force_transient= */ first_boot, arg_machine_id, /* ret_machine_id */ NULL); + (void) loopback_setup(); + bump_unix_max_dgram_qlen(); + bump_file_max_and_nr_open(); + test_usr(); + write_container_id(); + + /* Copy os-release to the propagate directory, so that we update it for services running + * under RootDirectory=/RootImage= when we do a soft reboot. */ + r = setup_os_release(RUNTIME_SCOPE_SYSTEM); + if (r < 0) + log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m"); + } + + r = watchdog_set_device(arg_watchdog_device); + if (r < 0) + log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device); + + break; + + case RUNTIME_SCOPE_USER: { + _cleanup_free_ char *p = NULL; + + /* Create the runtime directory and place the inaccessible device nodes there, if we run in + * user mode. In system mode mount_setup() already did that. */ + + r = xdg_user_runtime_dir(&p, "/systemd"); + if (r < 0) { + *ret_error_message = "$XDG_RUNTIME_DIR is not set"; + return log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Failed to determine $XDG_RUNTIME_DIR path: %m"), + "MESSAGE_ID=" SD_MESSAGE_CORE_NO_XDGDIR_PATH_STR); + } + + (void) mkdir_p_label(p, 0755); + (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID); + r = setup_os_release(RUNTIME_SCOPE_USER); + if (r < 0) + log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m"); + break; + } + + default: + assert_not_reached(); + } + + if (arg_timer_slack_nsec != NSEC_INFINITY) + if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0) + log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m"); + + if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) { + + if (!cap_test_all(arg_capability_bounding_set)) { + r = capability_bounding_set_drop_usermode(arg_capability_bounding_set); + if (r < 0) { + *ret_error_message = "Failed to drop capability bounding set of usermode helpers"; + return log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Failed to drop capability bounding set of usermode helpers: %m"), + "MESSAGE_ID=" SD_MESSAGE_CORE_CAPABILITY_BOUNDING_USER_STR); + } + + r = capability_bounding_set_drop(arg_capability_bounding_set, true); + if (r < 0) { + *ret_error_message = "Failed to drop capability bounding set"; + return log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Failed to drop capability bounding set: %m"), + "MESSAGE_ID=" SD_MESSAGE_CORE_CAPABILITY_BOUNDING_STR); + } + } + + if (arg_no_new_privs) { + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { + *ret_error_message = "Failed to disable new privileges"; + return log_struct_errno(LOG_EMERG, errno, + LOG_MESSAGE("Failed to disable new privileges: %m"), + "MESSAGE_ID=" SD_MESSAGE_CORE_DISABLE_PRIVILEGES_STR); + } + } + } + + if (arg_syscall_archs) { + r = enforce_syscall_archs(arg_syscall_archs); + if (r < 0) { + *ret_error_message = "Failed to set syscall architectures"; + return r; + } + } + + r = make_reaper_process(true); + if (r < 0) + log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m"); + + /* Bump up RLIMIT_NOFILE for systemd itself */ + (void) bump_rlimit_nofile(saved_rlimit_nofile); + (void) bump_rlimit_memlock(saved_rlimit_memlock); + + return 0; +} + +static int do_queue_default_job( + Manager *m, + const char **ret_error_message) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *unit; + Job *job; + Unit *target; + int r; + + if (arg_default_unit) + unit = arg_default_unit; + else if (in_initrd()) + unit = SPECIAL_INITRD_TARGET; + else + unit = SPECIAL_DEFAULT_TARGET; + + log_debug("Activating default unit: %s", unit); + + r = manager_load_startable_unit_or_warn(m, unit, NULL, &target); + if (r < 0 && in_initrd() && !arg_default_unit) { + /* Fall back to default.target, which we used to always use by default. Only do this if no + * explicit configuration was given. */ + + log_info("Falling back to " SPECIAL_DEFAULT_TARGET "."); + + r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target); + } + if (r < 0) { + log_info("Falling back to " SPECIAL_RESCUE_TARGET "."); + + r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target); + if (r < 0) { + *ret_error_message = r == -ERFKILL ? SPECIAL_RESCUE_TARGET " masked" + : "Failed to load " SPECIAL_RESCUE_TARGET; + return r; + } + } + + assert(target->load_state == UNIT_LOADED); + + r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, NULL, &error, &job); + if (r == -EPERM) { + log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r)); + + sd_bus_error_free(&error); + + r = manager_add_job(m, JOB_START, target, JOB_REPLACE, NULL, &error, &job); + if (r < 0) { + *ret_error_message = "Failed to start default target"; + return log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Failed to start default target: %s", bus_error_message(&error, r)), + "MESSAGE_ID=" SD_MESSAGE_CORE_START_TARGET_FAILED_STR); + } + + } else if (r < 0) { + *ret_error_message = "Failed to isolate default target"; + return log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Failed to isolate default target: %s", bus_error_message(&error, r)), + "MESSAGE_ID=" SD_MESSAGE_CORE_ISOLATE_TARGET_FAILED_STR); + } else + log_info("Queued %s job for default target %s.", + job_type_to_string(job->type), + unit_status_string(job->unit, NULL)); + + m->default_unit_job_id = job->id; + + return 0; +} + +static void save_rlimits(struct rlimit *saved_rlimit_nofile, + struct rlimit *saved_rlimit_memlock) { + + assert(saved_rlimit_nofile); + assert(saved_rlimit_memlock); + + if (getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile) < 0) + log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m"); + + if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock) < 0) + log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m"); +} + +static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) { + struct rlimit *rl; + + if (arg_defaults.rlimit[RLIMIT_NOFILE]) + return; + + /* Make sure forked processes get limits based on the original kernel setting */ + + rl = newdup(struct rlimit, saved_rlimit_nofile, 1); + if (!rl) { + log_oom(); + return; + } + + /* Bump the hard limit for system services to a substantially higher value. The default + * hard limit current kernels set is pretty low (4K), mostly for historical + * reasons. According to kernel developers, the fd handling in recent kernels has been + * optimized substantially enough, so that we can bump the limit now, without paying too + * high a price in memory or performance. Note however that we only bump the hard limit, + * not the soft limit. That's because select() works the way it works, and chokes on fds + * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to + * unexpecting programs that they get fds higher than what they can process using + * select(). By only bumping the hard limit but leaving the low limit as it is we avoid + * this pitfall: programs that are written by folks aware of the select() problem in mind + * (and thus use poll()/epoll instead of select(), the way everybody should) can + * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit + * we pass. */ + if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) { + int nr; + + /* Get the underlying absolute limit the kernel enforces */ + nr = read_nr_open(); + + rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE)); + } + + /* If for some reason we were invoked with a soft limit above 1024 (which should never + * happen!, but who knows what we get passed in from pam_limit when invoked as --user + * instance), then lower what we pass on to not confuse our children */ + rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE); + + arg_defaults.rlimit[RLIMIT_NOFILE] = rl; +} + +static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) { + struct rlimit *rl; + + /* Pass the original value down to invoked processes */ + + if (arg_defaults.rlimit[RLIMIT_MEMLOCK]) + return; + + rl = newdup(struct rlimit, saved_rlimit_memlock, 1); + if (!rl) { + log_oom(); + return; + } + + if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) { + /* Raise the default limit to 8M also on old kernels and in containers (8M is the kernel + * default for this since kernel 5.16) */ + rl->rlim_max = MAX(rl->rlim_max, (rlim_t) DEFAULT_RLIMIT_MEMLOCK); + rl->rlim_cur = MAX(rl->rlim_cur, (rlim_t) DEFAULT_RLIMIT_MEMLOCK); + } + + arg_defaults.rlimit[RLIMIT_MEMLOCK] = rl; +} + +static void setenv_manager_environment(void) { + int r; + + STRV_FOREACH(p, arg_manager_environment) { + log_debug("Setting '%s' in our own environment.", *p); + + r = putenv_dup(*p, true); + if (r < 0) + log_warning_errno(errno, "Failed to setenv \"%s\", ignoring: %m", *p); + } +} + +static void reset_arguments(void) { + /* Frees/resets arg_* variables, with a few exceptions commented below. */ + + arg_default_unit = mfree(arg_default_unit); + + /* arg_runtime_scope — ignore */ + + arg_dump_core = true; + arg_crash_chvt = -1; + arg_crash_shell = false; + arg_crash_reboot = false; + arg_confirm_spawn = mfree(arg_confirm_spawn); + arg_show_status = _SHOW_STATUS_INVALID; + arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT; + arg_switched_root = false; + arg_pager_flags = 0; + arg_service_watchdogs = true; + + unit_defaults_done(&arg_defaults); + unit_defaults_init(&arg_defaults, arg_runtime_scope); + + arg_runtime_watchdog = 0; + arg_reboot_watchdog = 10 * USEC_PER_MINUTE; + arg_kexec_watchdog = 0; + arg_pretimeout_watchdog = 0; + arg_early_core_pattern = mfree(arg_early_core_pattern); + arg_watchdog_device = mfree(arg_watchdog_device); + arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor); + + arg_default_environment = strv_free(arg_default_environment); + arg_manager_environment = strv_free(arg_manager_environment); + + arg_capability_bounding_set = CAP_MASK_UNSET; + arg_no_new_privs = false; + arg_timer_slack_nsec = NSEC_INFINITY; + + arg_syscall_archs = set_free(arg_syscall_archs); + + /* arg_serialization — ignore */ + + arg_machine_id = (sd_id128_t) {}; + arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE; + + cpu_set_reset(&arg_cpu_affinity); + numa_policy_reset(&arg_numa_policy); + + arg_random_seed = mfree(arg_random_seed); + arg_random_seed_size = 0; + arg_clock_usec = 0; + + arg_reload_limit_interval_sec = 0; + arg_reload_limit_burst = 0; +} + +static void determine_default_oom_score_adjust(void) { + int r, a, b; + + /* Run our services at slightly higher OOM score than ourselves. But let's be conservative here, and + * do this only if we don't run as root (i.e. only if we are run in user mode, for an unprivileged + * user). */ + + if (arg_defaults.oom_score_adjust_set) + return; + + if (getuid() == 0) + return; + + r = get_oom_score_adjust(&a); + if (r < 0) + return (void) log_warning_errno(r, "Failed to determine current OOM score adjustment value, ignoring: %m"); + + assert_cc(100 <= OOM_SCORE_ADJ_MAX); + b = a >= OOM_SCORE_ADJ_MAX - 100 ? OOM_SCORE_ADJ_MAX : a + 100; + + if (a == b) + return; + + arg_defaults.oom_score_adjust = b; + arg_defaults.oom_score_adjust_set = true; +} + +static int parse_configuration(const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock) { + int r; + + assert(saved_rlimit_nofile); + assert(saved_rlimit_memlock); + + /* Assign configuration defaults */ + reset_arguments(); + + r = parse_config_file(); + if (r < 0) + log_warning_errno(r, "Failed to parse config file, ignoring: %m"); + + if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) { + r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0); + if (r < 0) + log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m"); + } + + /* Initialize some default rlimits for services if they haven't been configured */ + fallback_rlimit_nofile(saved_rlimit_nofile); + fallback_rlimit_memlock(saved_rlimit_memlock); + + /* Note that this also parses bits from the kernel command line, including "debug". */ + log_parse_environment(); + + /* Initialize the show status setting if it hasn't been set explicitly yet */ + if (arg_show_status == _SHOW_STATUS_INVALID) + arg_show_status = SHOW_STATUS_YES; + + /* Slightly raise the OOM score for our services if we are running for unprivileged users. */ + determine_default_oom_score_adjust(); + + /* Push variables into the manager environment block */ + setenv_manager_environment(); + + /* Parse log environment variables again to take into account any new environment variables. */ + log_parse_environment(); + + return 0; +} + +static int safety_checks(void) { + + if (getpid_cached() == 1 && + arg_action != ACTION_RUN) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Unsupported execution mode while PID 1."); + + if (getpid_cached() == 1 && + arg_runtime_scope == RUNTIME_SCOPE_USER) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Can't run --user mode as PID 1."); + + if (arg_action == ACTION_RUN && + arg_runtime_scope == RUNTIME_SCOPE_SYSTEM && + getpid_cached() != 1) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Can't run system mode unless PID 1."); + + if (arg_action == ACTION_TEST && + geteuid() == 0) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Don't run test mode as root."); + + switch (arg_runtime_scope) { + + case RUNTIME_SCOPE_USER: + + if (arg_action == ACTION_RUN && + sd_booted() <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Trying to run as user instance, but the system has not been booted with systemd."); + + if (arg_action == ACTION_RUN && + !getenv("XDG_RUNTIME_DIR")) + return log_error_errno(SYNTHETIC_ERRNO(EUNATCH), + "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set."); + + break; + + case RUNTIME_SCOPE_SYSTEM: + if (arg_action == ACTION_RUN && + running_in_chroot() > 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Cannot be run in a chroot() environment."); + break; + + default: + assert_not_reached(); + } + + return 0; +} + +static int initialize_security( + bool *loaded_policy, + dual_timestamp *security_start_timestamp, + dual_timestamp *security_finish_timestamp, + const char **ret_error_message) { + + int r; + + assert(loaded_policy); + assert(security_start_timestamp); + assert(security_finish_timestamp); + assert(ret_error_message); + + dual_timestamp_now(security_start_timestamp); + + r = mac_selinux_setup(loaded_policy); + if (r < 0) { + *ret_error_message = "Failed to load SELinux policy"; + return r; + } + + r = mac_smack_setup(loaded_policy); + if (r < 0) { + *ret_error_message = "Failed to load SMACK policy"; + return r; + } + + r = mac_apparmor_setup(); + if (r < 0) { + *ret_error_message = "Failed to load AppArmor policy"; + return r; + } + + r = ima_setup(); + if (r < 0) { + *ret_error_message = "Failed to load IMA policy"; + return r; + } + + dual_timestamp_now(security_finish_timestamp); + return 0; +} + +static int collect_fds(FDSet **ret_fds, const char **ret_error_message) { + int r; + + assert(ret_fds); + assert(ret_error_message); + + /* Pick up all fds passed to us. We apply a filter here: we only take the fds that have O_CLOEXEC + * off. All fds passed via execve() to us must have O_CLOEXEC off, and our own code and dependencies + * should be clean enough to set O_CLOEXEC universally. Thus checking the bit should be a safe + * mechanism to distinguish passed in fds from our own. + * + * Why bother? Some subsystems we initialize early, specifically selinux might keep fds open in our + * process behind our back. We should not take possession of that (and then accidentally close + * it). SELinux thankfully sets O_CLOEXEC on its fds, so this test should work. */ + r = fdset_new_fill(/* filter_cloexec= */ 0, ret_fds); + if (r < 0) { + *ret_error_message = "Failed to allocate fd set"; + return log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Failed to allocate fd set: %m"), + "MESSAGE_ID=" SD_MESSAGE_CORE_FD_SET_FAILED_STR); + } + + /* The serialization fd should have O_CLOEXEC turned on already, let's verify that we didn't pick it up here */ + assert_se(!arg_serialization || !fdset_contains(*ret_fds, fileno(arg_serialization))); + + return 0; +} + +static void setup_console_terminal(bool skip_setup) { + + if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) + return; + + /* Become a session leader if we aren't one yet. */ + (void) setsid(); + + /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a + * controlling tty. */ + (void) release_terminal(); + + /* Reset the console, but only if this is really init and we are freshly booted */ + if (getpid_cached() == 1 && !skip_setup) + (void) console_setup(); +} + +static bool early_skip_setup_check(int argc, char *argv[]) { + bool found_deserialize = false; + + /* Determine if this is a reexecution or normal bootup. We do the full command line parsing much + * later, so let's just have a quick peek here. Note that if we have switched root, do all the + * special setup things anyway, even if in that case we also do deserialization. */ + + for (int i = 1; i < argc; i++) + if (streq(argv[i], "--switched-root")) + return false; /* If we switched root, don't skip the setup. */ + else if (startswith(argv[i], "--deserialize=") || streq(argv[i], "--deserialize")) + found_deserialize = true; + + return found_deserialize; /* When we are deserializing, then we are reexecuting, hence avoid the extensive setup */ +} + +static int save_env(void) { + char **l; + + l = strv_copy(environ); + if (!l) + return -ENOMEM; + + strv_free_and_replace(saved_env, l); + return 0; +} + +int main(int argc, char *argv[]) { + dual_timestamp + initrd_timestamp = DUAL_TIMESTAMP_NULL, + userspace_timestamp = DUAL_TIMESTAMP_NULL, + kernel_timestamp = DUAL_TIMESTAMP_NULL, + security_start_timestamp = DUAL_TIMESTAMP_NULL, + security_finish_timestamp = DUAL_TIMESTAMP_NULL; + struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0), + saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed + * in. Note we use different values + * for the two that indicate whether + * these fields are initialized! */ + bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false; + char *switch_root_dir = NULL, *switch_root_init = NULL; + usec_t before_startup, after_startup; + static char systemd[] = "systemd"; + const char *error_message = NULL; + int r, retval = EXIT_FAILURE; + Manager *m = NULL; + FDSet *fds = NULL; + + assert_se(argc > 0 && !isempty(argv[0])); + + /* SysV compatibility: redirect init → telinit */ + redirect_telinit(argc, argv); + + /* Take timestamps early on */ + dual_timestamp_from_monotonic(&kernel_timestamp, 0); + dual_timestamp_now(&userspace_timestamp); + + /* Figure out whether we need to do initialize the system, or if we already did that because we are + * reexecuting. */ + skip_setup = early_skip_setup_check(argc, argv); + + /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent + * reexecution we are then called 'systemd'. That is confusing, hence let's call us systemd + * right-away. */ + program_invocation_short_name = systemd; + (void) prctl(PR_SET_NAME, systemd); + + /* Save the original command line */ + save_argc_argv(argc, argv); + + /* Save the original environment as we might need to restore it if we're requested to execute another + * system manager later. */ + r = save_env(); + if (r < 0) { + error_message = "Failed to copy environment block"; + goto finish; + } + + /* Make sure that if the user says "syslog" we actually log to the journal. */ + log_set_upgrade_syslog_to_journal(true); + + if (getpid_cached() == 1) { + /* When we run as PID 1 force system mode */ + arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; + + /* Disable the umask logic */ + umask(0); + + /* Make sure that at least initially we do not ever log to journald/syslogd, because it might + * not be activated yet (even though the log socket for it exists). */ + log_set_prohibit_ipc(true); + + /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This + * is important so that we never end up logging to any foreign stderr, for example if we have + * to log in a child process right before execve()'ing the actual binary, at a point in time + * where socket activation stderr/stdout area already set up. */ + log_set_always_reopen_console(true); + + if (detect_container() <= 0) { + + /* Running outside of a container as PID 1 */ + log_set_target_and_open(LOG_TARGET_KMSG); + + if (in_initrd()) + initrd_timestamp = userspace_timestamp; + + if (!skip_setup) { + r = mount_setup_early(); + if (r < 0) { + error_message = "Failed to mount early API filesystems"; + goto finish; + } + } + + /* We might have just mounted /proc, so let's try to parse the kernel + * command line log arguments immediately. */ + log_parse_environment(); + + /* Let's open the log backend a second time, in case the first time didn't + * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became + * available, and it previously wasn't. */ + log_open(); + + if (!skip_setup) { + disable_printk_ratelimit(); + + r = initialize_security( + &loaded_policy, + &security_start_timestamp, + &security_finish_timestamp, + &error_message); + if (r < 0) + goto finish; + } + + if (mac_init() < 0) { + error_message = "Failed to initialize MAC support"; + goto finish; + } + + if (!skip_setup) + initialize_clock(); + + /* Set the default for later on, but don't actually open the logs like this for + * now. Note that if we are transitioning from the initrd there might still be + * journal fd open, and we shouldn't attempt opening that before we parsed + * /proc/cmdline which might redirect output elsewhere. */ + log_set_target(LOG_TARGET_JOURNAL_OR_KMSG); + + } else { + /* Running inside a container, as PID 1 */ + log_set_target_and_open(LOG_TARGET_CONSOLE); + + /* For later on, see above... */ + log_set_target(LOG_TARGET_JOURNAL); + + /* clear the kernel timestamp, because we are in a container */ + kernel_timestamp = DUAL_TIMESTAMP_NULL; + } + + initialize_coredump(skip_setup); + + r = fixup_environment(); + if (r < 0) { + log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Failed to fix up PID 1 environment: %m"), + "MESSAGE_ID=" SD_MESSAGE_CORE_PID1_ENVIRONMENT_STR); + error_message = "Failed to fix up PID1 environment"; + goto finish; + } + + /* Try to figure out if we can use colors with the console. No need to do that for user + * instances since they never log into the console. */ + log_show_color(colors_enabled()); + + r = make_null_stdio(); + if (r < 0) + log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m"); + + /* Load the kernel modules early. */ + if (!skip_setup) + (void) kmod_setup(); + + /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */ + r = mount_setup(loaded_policy, skip_setup); + if (r < 0) { + error_message = "Failed to mount API filesystems"; + goto finish; + } + + /* The efivarfs is now mounted, let's lock down the system token. */ + lock_down_efi_variables(); + + /* Cache command-line options passed from EFI variables */ + if (!skip_setup) + (void) cache_efi_options_variable(); + } else { + /* Running as user instance */ + arg_runtime_scope = RUNTIME_SCOPE_USER; + log_set_always_reopen_console(true); + log_set_target_and_open(LOG_TARGET_AUTO); + + /* clear the kernel timestamp, because we are not PID 1 */ + kernel_timestamp = DUAL_TIMESTAMP_NULL; + + /* Clear ambient capabilities, so services do not inherit them implicitly. Dropping them does + * not affect the permitted and effective sets which are important for the manager itself to + * operate. */ + capability_ambient_set_apply(0, /* also_inherit= */ false); + + if (mac_init() < 0) { + error_message = "Failed to initialize MAC support"; + goto finish; + } + } + + /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when + * transitioning from the initrd to the main systemd or suchlike. */ + save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock); + + /* Reset all signal handlers. */ + (void) reset_all_signal_handlers(); + (void) ignore_signals(SIGNALS_IGNORE); + + (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock); + + r = parse_argv(argc, argv); + if (r < 0) { + error_message = "Failed to parse command line arguments"; + goto finish; + } + + r = safety_checks(); + if (r < 0) + goto finish; + + if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT)) + pager_open(arg_pager_flags); + + if (arg_action != ACTION_RUN) + skip_setup = true; + + if (arg_action == ACTION_HELP) { + retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS; + goto finish; + } else if (arg_action == ACTION_VERSION) { + retval = version(); + goto finish; + } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) { + unit_dump_config_items(stdout); + retval = EXIT_SUCCESS; + goto finish; + } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) { + dump_bus_properties(stdout); + retval = EXIT_SUCCESS; + goto finish; + } else if (arg_action == ACTION_BUS_INTROSPECT) { + r = bus_manager_introspect_implementations(stdout, arg_bus_introspect); + retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE; + goto finish; + } + + assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST)); + + /* Move out of the way, so that we won't block unmounts */ + assert_se(chdir("/") == 0); + + if (arg_action == ACTION_RUN) { + if (!skip_setup) { + /* Apply the systemd.clock_usec= kernel command line switch */ + apply_clock_update(); + + /* Apply random seed from kernel command line */ + cmdline_take_random_seed(); + } + + /* A core pattern might have been specified via the cmdline. */ + initialize_core_pattern(skip_setup); + + /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */ + log_close(); + + /* Remember open file descriptors for later deserialization */ + r = collect_fds(&fds, &error_message); + if (r < 0) + goto finish; + + /* Give up any control of the console, but make sure its initialized. */ + setup_console_terminal(skip_setup); + + /* Open the logging devices, if possible and necessary */ + log_open(); + } + + log_execution_mode(&first_boot); + + r = initialize_runtime(skip_setup, + first_boot, + &saved_rlimit_nofile, + &saved_rlimit_memlock, + &error_message); + if (r < 0) + goto finish; + + r = manager_new(arg_runtime_scope, + arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0, + &m); + if (r < 0) { + log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Failed to allocate manager object: %m"), + "MESSAGE_ID=" SD_MESSAGE_CORE_MANAGER_ALLOCATE_STR); + error_message = "Failed to allocate manager object"; + goto finish; + } + + m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp; + m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp; + m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp; + m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp; + m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp; + + set_manager_defaults(m); + set_manager_settings(m); + manager_set_first_boot(m, first_boot); + manager_set_switching_root(m, arg_switched_root); + + /* Remember whether we should queue the default job */ + queue_default_job = !arg_serialization || arg_switched_root; + + before_startup = now(CLOCK_MONOTONIC); + + r = manager_startup(m, arg_serialization, fds, /* root= */ NULL); + if (r < 0) { + error_message = "Failed to start up manager"; + goto finish; + } + + /* This will close all file descriptors that were opened, but not claimed by any unit. */ + fds = fdset_free(fds); + arg_serialization = safe_fclose(arg_serialization); + + if (queue_default_job) { + r = do_queue_default_job(m, &error_message); + if (r < 0) + goto finish; + } + + after_startup = now(CLOCK_MONOTONIC); + + log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG, + "Loaded units and determined initial transaction in %s.", + FORMAT_TIMESPAN(after_startup - before_startup, 100 * USEC_PER_MSEC)); + + if (arg_action == ACTION_TEST) { + manager_test_summary(m); + retval = EXIT_SUCCESS; + goto finish; + } + + r = invoke_main_loop(m, + &saved_rlimit_nofile, + &saved_rlimit_memlock, + &retval, + &fds, + &switch_root_dir, + &switch_root_init, + &error_message); + assert(r < 0 || IN_SET(r, MANAGER_EXIT, /* MANAGER_OK is not expected here. */ + MANAGER_RELOAD, + MANAGER_REEXECUTE, + MANAGER_REBOOT, + MANAGER_SOFT_REBOOT, + MANAGER_POWEROFF, + MANAGER_HALT, + MANAGER_KEXEC, + MANAGER_SWITCH_ROOT)); + +finish: + pager_close(); + + if (m) { + arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT); + arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC); + m = manager_free(m); + } + + mac_selinux_finish(); + + if (IN_SET(r, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT)) + r = do_reexecute(r, + argc, argv, + &saved_rlimit_nofile, + &saved_rlimit_memlock, + fds, + switch_root_dir, + switch_root_init, + &error_message); /* This only returns if reexecution failed */ + + arg_serialization = safe_fclose(arg_serialization); + fds = fdset_free(fds); + + saved_env = strv_free(saved_env); + +#if HAVE_VALGRIND_VALGRIND_H + /* If we are PID 1 and running under valgrind, then let's exit + * here explicitly. valgrind will only generate nice output on + * exit(), not on exec(), hence let's do the former not the + * latter here. */ + if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) { + /* Cleanup watchdog_device strings for valgrind. We need them + * in become_shutdown() so normally we cannot free them yet. */ + watchdog_free_device(); + reset_arguments(); + return retval; + } +#endif + +#if HAS_FEATURE_ADDRESS_SANITIZER + /* At this stage we most likely don't have stdio/stderr open, so the following + * LSan check would not print any actionable information and would just crash + * PID 1. To make this a bit more helpful, let's try to open /dev/console, + * and if we succeed redirect LSan's report there. */ + if (getpid_cached() == 1) { + _cleanup_close_ int tty_fd = -EBADF; + + tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (tty_fd >= 0) + __sanitizer_set_report_fd((void*) (intptr_t) tty_fd); + + __lsan_do_leak_check(); + } +#endif + + if (r < 0) + (void) sd_notifyf(0, "ERRNO=%i", -r); + + /* Try to invoke the shutdown binary unless we already failed. + * If we failed above, we want to freeze after finishing cleanup. */ + if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM && + IN_SET(r, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC)) { + r = become_shutdown(r, retval); + log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting"); + error_message = "Failed to execute shutdown binary"; + } + + /* This is primarily useful when running systemd in a VM, as it provides the user running the VM with + * a mechanism to pick up systemd's exit status in the VM. */ + (void) sd_notifyf(0, "EXIT_STATUS=%i", retval); + + watchdog_free_device(); + arg_watchdog_device = mfree(arg_watchdog_device); + + if (getpid_cached() == 1) { + if (error_message) + manager_status_printf(NULL, STATUS_TYPE_EMERGENCY, + ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL, + "%s.", error_message); + freeze_or_exit_or_reboot(); + } + + reset_arguments(); + return retval; +} diff --git a/src/core/main.h b/src/core/main.h new file mode 100644 index 0000000..b12a1cc --- /dev/null +++ b/src/core/main.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +extern bool arg_dump_core; +extern int arg_crash_chvt; +extern bool arg_crash_shell; +extern bool arg_crash_reboot; diff --git a/src/core/manager-dump.c b/src/core/manager-dump.c new file mode 100644 index 0000000..6c32d78 --- /dev/null +++ b/src/core/manager-dump.c @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "build.h" +#include "fd-util.h" +#include "fileio.h" +#include "hashmap.h" +#include "manager-dump.h" +#include "memstream-util.h" +#include "unit-serialize.h" +#include "version.h" + +void manager_dump_jobs(Manager *s, FILE *f, char **patterns, const char *prefix) { + Job *j; + + assert(s); + assert(f); + + HASHMAP_FOREACH(j, s->jobs) { + + if (!strv_fnmatch_or_empty(patterns, j->unit->id, FNM_NOESCAPE)) + continue; + + job_dump(j, f, prefix); + } +} + +int manager_get_dump_jobs_string(Manager *m, char **patterns, const char *prefix, char **ret) { + _cleanup_(memstream_done) MemStream ms = {}; + FILE *f; + + assert(m); + assert(ret); + + f = memstream_init(&ms); + if (!f) + return -errno; + + manager_dump_jobs(m, f, patterns, prefix); + + return memstream_finalize(&ms, ret, NULL); +} + +void manager_dump_units(Manager *s, FILE *f, char **patterns, const char *prefix) { + Unit *u; + const char *t; + + assert(s); + assert(f); + + HASHMAP_FOREACH_KEY(u, t, s->units) { + if (u->id != t) + continue; + + if (!strv_fnmatch_or_empty(patterns, u->id, FNM_NOESCAPE)) + continue; + + unit_dump(u, f, prefix); + } +} + +static void manager_dump_header(Manager *m, FILE *f, const char *prefix) { + + /* NB: this is a debug interface for developers. It's not supposed to be machine readable or be + * stable between versions. We take the liberty to restructure it entirely between versions and + * add/remove fields at will. */ + + fprintf(f, "%sManager: systemd " STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")\n", strempty(prefix)); + fprintf(f, "%sFeatures: %s\n", strempty(prefix), systemd_features); + + for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) { + const dual_timestamp *t = m->timestamps + q; + + if (dual_timestamp_is_set(t)) + fprintf(f, "%sTimestamp %s: %s\n", + strempty(prefix), + manager_timestamp_to_string(q), + timestamp_is_set(t->realtime) ? FORMAT_TIMESTAMP(t->realtime) : + FORMAT_TIMESPAN(t->monotonic, 1)); + } +} + +void manager_dump(Manager *m, FILE *f, char **patterns, const char *prefix) { + assert(m); + assert(f); + + /* If no pattern is provided, dump the full manager state including the manager version, features and + * so on. Otherwise limit the dump to the units/jobs matching the specified patterns. */ + if (!patterns) + manager_dump_header(m, f, prefix); + + manager_dump_units(m, f, patterns, prefix); + manager_dump_jobs(m, f, patterns, prefix); +} + +int manager_get_dump_string(Manager *m, char **patterns, char **ret) { + _cleanup_(memstream_done) MemStream ms = {}; + FILE *f; + + assert(m); + assert(ret); + + f = memstream_init(&ms); + if (!f) + return -errno; + + manager_dump(m, f, patterns, NULL); + + return memstream_finalize(&ms, ret, NULL); +} + +void manager_test_summary(Manager *m) { + assert(m); + + printf("-> By units:\n"); + manager_dump_units(m, stdout, /* patterns= */ NULL, "\t"); + + printf("-> By jobs:\n"); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); +} diff --git a/src/core/manager-dump.h b/src/core/manager-dump.h new file mode 100644 index 0000000..5b96f26 --- /dev/null +++ b/src/core/manager-dump.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "manager.h" + +void manager_dump_jobs(Manager *s, FILE *f, char **patterns, const char *prefix); +int manager_get_dump_jobs_string(Manager *m, char **patterns, const char *prefix, char **ret); +void manager_dump_units(Manager *s, FILE *f, char **patterns, const char *prefix); +void manager_dump(Manager *s, FILE *f, char **patterns, const char *prefix); +int manager_get_dump_string(Manager *m, char **patterns, char **ret); +void manager_test_summary(Manager *m); diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c new file mode 100644 index 0000000..e9d567a --- /dev/null +++ b/src/core/manager-serialize.c @@ -0,0 +1,539 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "clean-ipc.h" +#include "core-varlink.h" +#include "dbus.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "initrd-util.h" +#include "macro.h" +#include "manager-serialize.h" +#include "manager.h" +#include "parse-util.h" +#include "serialize.h" +#include "syslog-util.h" +#include "unit-serialize.h" +#include "user-util.h" +#include "varlink-internal.h" + +int manager_open_serialization(Manager *m, FILE **ret_f) { + assert(ret_f); + + return open_serialization_file("systemd-state", ret_f); +} + +static bool manager_timestamp_shall_serialize(ManagerTimestamp t) { + if (!in_initrd()) + return true; + + /* The following timestamps only apply to the host system, hence only serialize them there */ + return !IN_SET(t, + MANAGER_TIMESTAMP_USERSPACE, MANAGER_TIMESTAMP_FINISH, + MANAGER_TIMESTAMP_SECURITY_START, MANAGER_TIMESTAMP_SECURITY_FINISH, + MANAGER_TIMESTAMP_GENERATORS_START, MANAGER_TIMESTAMP_GENERATORS_FINISH, + MANAGER_TIMESTAMP_UNITS_LOAD_START, MANAGER_TIMESTAMP_UNITS_LOAD_FINISH); +} + +static void manager_serialize_uid_refs_internal( + FILE *f, + Hashmap *uid_refs, + const char *field_name) { + + void *p, *k; + + assert(f); + assert(field_name); + + /* Serialize the UID reference table. Or actually, just the IPC destruction flag of it, as + * the actual counter of it is better rebuild after a reload/reexec. */ + + HASHMAP_FOREACH_KEY(p, k, uid_refs) { + uint32_t c; + uid_t uid; + + uid = PTR_TO_UID(k); + c = PTR_TO_UINT32(p); + + if (!(c & DESTROY_IPC_FLAG)) + continue; + + (void) serialize_item_format(f, field_name, UID_FMT, uid); + } +} + +static void manager_serialize_uid_refs(Manager *m, FILE *f) { + manager_serialize_uid_refs_internal(f, m->uid_refs, "destroy-ipc-uid"); +} + +static void manager_serialize_gid_refs(Manager *m, FILE *f) { + manager_serialize_uid_refs_internal(f, m->gid_refs, "destroy-ipc-gid"); +} + +int manager_serialize( + Manager *m, + FILE *f, + FDSet *fds, + bool switching_root) { + + const char *t; + Unit *u; + int r; + + assert(m); + assert(f); + assert(fds); + + _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m); + + (void) serialize_item_format(f, "current-job-id", "%" PRIu32, m->current_job_id); + (void) serialize_item_format(f, "n-installed-jobs", "%u", m->n_installed_jobs); + (void) serialize_item_format(f, "n-failed-jobs", "%u", m->n_failed_jobs); + (void) serialize_bool(f, "ready-sent", m->ready_sent); + (void) serialize_bool(f, "taint-logged", m->taint_logged); + (void) serialize_bool(f, "service-watchdogs", m->service_watchdogs); + + if (m->show_status_overridden != _SHOW_STATUS_INVALID) + (void) serialize_item(f, "show-status-overridden", + show_status_to_string(m->show_status_overridden)); + + if (m->log_level_overridden) + (void) serialize_item_format(f, "log-level-override", "%i", log_get_max_level()); + if (m->log_target_overridden) + (void) serialize_item(f, "log-target-override", log_target_to_string(log_get_target())); + + (void) serialize_usec(f, "runtime-watchdog-overridden", m->watchdog_overridden[WATCHDOG_RUNTIME]); + (void) serialize_usec(f, "reboot-watchdog-overridden", m->watchdog_overridden[WATCHDOG_REBOOT]); + (void) serialize_usec(f, "kexec-watchdog-overridden", m->watchdog_overridden[WATCHDOG_KEXEC]); + (void) serialize_usec(f, "pretimeout-watchdog-overridden", m->watchdog_overridden[WATCHDOG_PRETIMEOUT]); + (void) serialize_item(f, "pretimeout-watchdog-governor-overridden", m->watchdog_pretimeout_governor_overridden); + + for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) { + _cleanup_free_ char *joined = NULL; + + if (!manager_timestamp_shall_serialize(q)) + continue; + + joined = strjoin(manager_timestamp_to_string(q), "-timestamp"); + if (!joined) + return log_oom(); + + (void) serialize_dual_timestamp(f, joined, m->timestamps + q); + } + + if (!switching_root) + (void) serialize_strv(f, "env", m->client_environment); + + if (m->notify_fd >= 0) { + r = serialize_fd(f, fds, "notify-fd", m->notify_fd); + if (r < 0) + return r; + + (void) serialize_item(f, "notify-socket", m->notify_socket); + } + + if (m->cgroups_agent_fd >= 0) { + r = serialize_fd(f, fds, "cgroups-agent-fd", m->cgroups_agent_fd); + if (r < 0) + return r; + } + + if (m->user_lookup_fds[0] >= 0) { + int copy0, copy1; + + copy0 = fdset_put_dup(fds, m->user_lookup_fds[0]); + if (copy0 < 0) + return log_error_errno(copy0, "Failed to add user lookup fd to serialization: %m"); + + copy1 = fdset_put_dup(fds, m->user_lookup_fds[1]); + if (copy1 < 0) + return log_error_errno(copy1, "Failed to add user lookup fd to serialization: %m"); + + (void) serialize_item_format(f, "user-lookup", "%i %i", copy0, copy1); + } + + (void) serialize_ratelimit(f, "dump-ratelimit", &m->dump_ratelimit); + + bus_track_serialize(m->subscribed, f, "subscribed"); + + r = dynamic_user_serialize(m, f, fds); + if (r < 0) + return r; + + manager_serialize_uid_refs(m, f); + manager_serialize_gid_refs(m, f); + + r = exec_shared_runtime_serialize(m, f, fds); + if (r < 0) + return r; + + r = varlink_server_serialize(m->varlink_server, f, fds); + if (r < 0) + return r; + + (void) fputc('\n', f); + + HASHMAP_FOREACH_KEY(u, t, m->units) { + if (u->id != t) + continue; + + r = unit_serialize_state(u, f, fds, switching_root); + if (r < 0) + return r; + } + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to flush serialization: %m"); + + r = bus_fdset_add_all(m, fds); + if (r < 0) + return log_error_errno(r, "Failed to add bus sockets to serialization: %m"); + + return 0; +} + +static int manager_deserialize_one_unit(Manager *m, const char *name, FILE *f, FDSet *fds) { + Unit *u; + int r; + + r = manager_load_unit(m, name, NULL, NULL, &u); + if (r < 0) { + if (r == -ENOMEM) + return r; + return log_notice_errno(r, "Failed to load unit \"%s\", skipping deserialization: %m", name); + } + + r = unit_deserialize_state(u, f, fds); + if (r < 0) { + if (r == -ENOMEM) + return r; + return log_notice_errno(r, "Failed to deserialize unit \"%s\", skipping: %m", name); + } + + return 0; +} + +static int manager_deserialize_units(Manager *m, FILE *f, FDSet *fds) { + int r; + + for (;;) { + _cleanup_free_ char *line = NULL; + + /* Start marker */ + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + break; + + r = manager_deserialize_one_unit(m, line, f, fds); + if (r == -ENOMEM) + return r; + if (r < 0) { + r = unit_deserialize_state_skip(f); + if (r < 0) + return r; + } + } + + return 0; +} + +static void manager_deserialize_uid_refs_one_internal( + Hashmap** uid_refs, + const char *value) { + + uid_t uid; + uint32_t c; + int r; + + assert(uid_refs); + assert(value); + + r = parse_uid(value, &uid); + if (r < 0 || uid == 0) { + log_debug("Unable to parse UID/GID reference serialization: %s", value); + return; + } + + if (hashmap_ensure_allocated(uid_refs, &trivial_hash_ops) < 0) { + log_oom(); + return; + } + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + if (c & DESTROY_IPC_FLAG) + return; + + c |= DESTROY_IPC_FLAG; + + r = hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)); + if (r < 0) { + log_debug_errno(r, "Failed to add UID/GID reference entry: %m"); + return; + } +} + +static void manager_deserialize_uid_refs_one(Manager *m, const char *value) { + manager_deserialize_uid_refs_one_internal(&m->uid_refs, value); +} + +static void manager_deserialize_gid_refs_one(Manager *m, const char *value) { + manager_deserialize_uid_refs_one_internal(&m->gid_refs, value); +} + +int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { + bool deserialize_varlink_sockets = false; + int r = 0; + + assert(m); + assert(f); + + if (DEBUG_LOGGING) { + if (fdset_isempty(fds)) + log_debug("No file descriptors passed"); + else { + int fd; + + FDSET_FOREACH(fd, fds) { + _cleanup_free_ char *fn = NULL; + + r = fd_get_path(fd, &fn); + if (r < 0) + log_debug_errno(r, "Received serialized fd %i %s %m", + fd, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT)); + else + log_debug("Received serialized fd %i %s %s", + fd, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), strna(fn)); + } + } + } + + log_debug("Deserializing state..."); + + /* If we are not in reload mode yet, enter it now. Not that this is recursive, a caller might already have + * increased it to non-zero, which is why we just increase it by one here and down again at the end of this + * call. */ + _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m); + + for (;;) { + _cleanup_free_ char *l = NULL; + const char *val; + + r = deserialize_read_line(f, &l); + if (r < 0) + return r; + if (r == 0) /* eof or end marker */ + break; + + if ((val = startswith(l, "current-job-id="))) { + uint32_t id; + + if (safe_atou32(val, &id) < 0) + log_notice("Failed to parse current job id value '%s', ignoring.", val); + else + m->current_job_id = MAX(m->current_job_id, id); + + } else if ((val = startswith(l, "n-installed-jobs="))) { + uint32_t n; + + if (safe_atou32(val, &n) < 0) + log_notice("Failed to parse installed jobs counter '%s', ignoring.", val); + else + m->n_installed_jobs += n; + + } else if ((val = startswith(l, "n-failed-jobs="))) { + uint32_t n; + + if (safe_atou32(val, &n) < 0) + log_notice("Failed to parse failed jobs counter '%s', ignoring.", val); + else + m->n_failed_jobs += n; + + } else if ((val = startswith(l, "ready-sent="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse ready-sent flag '%s', ignoring.", val); + else + m->ready_sent = m->ready_sent || b; + + } else if ((val = startswith(l, "taint-logged="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse taint-logged flag '%s', ignoring.", val); + else + m->taint_logged = m->taint_logged || b; + + } else if ((val = startswith(l, "service-watchdogs="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse service-watchdogs flag '%s', ignoring.", val); + else + m->service_watchdogs = b; + + } else if ((val = startswith(l, "show-status-overridden="))) { + ShowStatus s; + + s = show_status_from_string(val); + if (s < 0) + log_notice("Failed to parse show-status-overridden flag '%s', ignoring.", val); + else + manager_override_show_status(m, s, "deserialize"); + + } else if ((val = startswith(l, "log-level-override="))) { + int level; + + level = log_level_from_string(val); + if (level < 0) + log_notice("Failed to parse log-level-override value '%s', ignoring.", val); + else + manager_override_log_level(m, level); + + } else if ((val = startswith(l, "log-target-override="))) { + LogTarget target; + + target = log_target_from_string(val); + if (target < 0) + log_notice("Failed to parse log-target-override value '%s', ignoring.", val); + else + manager_override_log_target(m, target); + + } else if ((val = startswith(l, "runtime-watchdog-overridden="))) { + usec_t t; + + if (deserialize_usec(val, &t) < 0) + log_notice("Failed to parse runtime-watchdog-overridden value '%s', ignoring.", val); + else + manager_override_watchdog(m, WATCHDOG_RUNTIME, t); + + } else if ((val = startswith(l, "reboot-watchdog-overridden="))) { + usec_t t; + + if (deserialize_usec(val, &t) < 0) + log_notice("Failed to parse reboot-watchdog-overridden value '%s', ignoring.", val); + else + manager_override_watchdog(m, WATCHDOG_REBOOT, t); + + } else if ((val = startswith(l, "kexec-watchdog-overridden="))) { + usec_t t; + + if (deserialize_usec(val, &t) < 0) + log_notice("Failed to parse kexec-watchdog-overridden value '%s', ignoring.", val); + else + manager_override_watchdog(m, WATCHDOG_KEXEC, t); + + } else if ((val = startswith(l, "pretimeout-watchdog-overridden="))) { + usec_t t; + + if (deserialize_usec(val, &t) < 0) + log_notice("Failed to parse pretimeout-watchdog-overridden value '%s', ignoring.", val); + else + manager_override_watchdog(m, WATCHDOG_PRETIMEOUT, t); + + } else if ((val = startswith(l, "pretimeout-watchdog-governor-overridden="))) { + r = free_and_strdup(&m->watchdog_pretimeout_governor_overridden, val); + if (r < 0) + return r; + + } else if (startswith(l, "env=")) { + r = deserialize_environment(l + 4, &m->client_environment); + if (r < 0) + log_notice_errno(r, "Failed to parse environment entry: \"%s\", ignoring: %m", l); + + } else if ((val = startswith(l, "notify-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd >= 0) { + m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source); + safe_close(m->notify_fd); + m->notify_fd = fd; + } + + } else if ((val = startswith(l, "notify-socket="))) { + r = free_and_strdup(&m->notify_socket, val); + if (r < 0) + return r; + + } else if ((val = startswith(l, "cgroups-agent-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd >= 0) { + m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source); + safe_close(m->cgroups_agent_fd); + m->cgroups_agent_fd = fd; + } + + } else if ((val = startswith(l, "user-lookup="))) { + int fd0, fd1; + + if (sscanf(val, "%i %i", &fd0, &fd1) != 2 || fd0 < 0 || fd1 < 0 || fd0 == fd1 || !fdset_contains(fds, fd0) || !fdset_contains(fds, fd1)) + log_notice("Failed to parse user lookup fd, ignoring: %s", val); + else { + m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source); + safe_close_pair(m->user_lookup_fds); + m->user_lookup_fds[0] = fdset_remove(fds, fd0); + m->user_lookup_fds[1] = fdset_remove(fds, fd1); + } + + } else if ((val = startswith(l, "dynamic-user="))) + dynamic_user_deserialize_one(m, val, fds, NULL); + else if ((val = startswith(l, "destroy-ipc-uid="))) + manager_deserialize_uid_refs_one(m, val); + else if ((val = startswith(l, "destroy-ipc-gid="))) + manager_deserialize_gid_refs_one(m, val); + else if ((val = startswith(l, "exec-runtime="))) + (void) exec_shared_runtime_deserialize_one(m, val, fds); + else if ((val = startswith(l, "subscribed="))) { + + if (strv_extend(&m->deserialized_subscribed, val) < 0) + return -ENOMEM; + } else if ((val = startswith(l, "varlink-server-socket-address="))) { + if (!m->varlink_server && MANAGER_IS_SYSTEM(m)) { + r = manager_varlink_init(m); + if (r < 0) { + log_warning_errno(r, "Failed to setup varlink server, ignoring: %m"); + continue; + } + + deserialize_varlink_sockets = true; + } + + /* To avoid unnecessary deserialization (i.e. during reload vs. reexec) we only deserialize + * the FDs if we had to create a new m->varlink_server. The deserialize_varlink_sockets flag + * is initialized outside of the loop, is flipped after the VarlinkServer is setup, and + * remains set until all serialized contents are handled. */ + if (deserialize_varlink_sockets) + (void) varlink_server_deserialize_one(m->varlink_server, val, fds); + } else if ((val = startswith(l, "dump-ratelimit="))) + deserialize_ratelimit(&m->dump_ratelimit, "dump-ratelimit", val); + else { + ManagerTimestamp q; + + for (q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) { + val = startswith(l, manager_timestamp_to_string(q)); + if (!val) + continue; + + val = startswith(val, "-timestamp="); + if (val) + break; + } + + if (q < _MANAGER_TIMESTAMP_MAX) /* found it */ + (void) deserialize_dual_timestamp(val, m->timestamps + q); + else if (!STARTSWITH_SET(l, "kdbus-fd=", "honor-device-enumeration=")) /* ignore deprecated values */ + log_notice("Unknown serialization item '%s', ignoring.", l); + } + } + + return manager_deserialize_units(m, f, fds); +} diff --git a/src/core/manager-serialize.h b/src/core/manager-serialize.h new file mode 100644 index 0000000..c52261e --- /dev/null +++ b/src/core/manager-serialize.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "manager.h" +#include "fdset.h" + +#define DESTROY_IPC_FLAG (UINT32_C(1) << 31) + +int manager_open_serialization(Manager *m, FILE **ret_f); +int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root); +int manager_deserialize(Manager *m, FILE *f, FDSet *fds); diff --git a/src/core/manager.c b/src/core/manager.c new file mode 100644 index 0000000..88eebfc --- /dev/null +++ b/src/core/manager.c @@ -0,0 +1,5039 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if HAVE_AUDIT +#include +#endif + +#include "sd-daemon.h" +#include "sd-messages.h" +#include "sd-path.h" + +#include "all-units.h" +#include "alloc-util.h" +#include "audit-fd.h" +#include "boot-timestamps.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-kernel.h" +#include "bus-util.h" +#include "clean-ipc.h" +#include "clock-util.h" +#include "common-signal.h" +#include "confidential-virt.h" +#include "constants.h" +#include "core-varlink.h" +#include "creds-util.h" +#include "dbus-job.h" +#include "dbus-manager.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "dirent-util.h" +#include "env-util.h" +#include "escape.h" +#include "event-util.h" +#include "exec-util.h" +#include "execute.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "generator-setup.h" +#include "hashmap.h" +#include "initrd-util.h" +#include "inotify-util.h" +#include "install.h" +#include "io-util.h" +#include "label-util.h" +#include "load-fragment.h" +#include "locale-setup.h" +#include "log.h" +#include "macro.h" +#include "manager.h" +#include "manager-dump.h" +#include "manager-serialize.h" +#include "memory-util.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "os-util.h" +#include "parse-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "plymouth-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "psi-util.h" +#include "ratelimit.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "sysctl-util.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "time-util.h" +#include "transaction.h" +#include "uid-range.h" +#include "umask-util.h" +#include "unit-name.h" +#include "user-util.h" +#include "virt.h" +#include "watchdog.h" + +#define NOTIFY_RCVBUF_SIZE (8*1024*1024) +#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024) + +/* Initial delay and the interval for printing status messages about running jobs */ +#define JOBS_IN_PROGRESS_WAIT_USEC (2*USEC_PER_SEC) +#define JOBS_IN_PROGRESS_QUIET_WAIT_USEC (25*USEC_PER_SEC) +#define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3) +#define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3 + +/* If there are more than 1K bus messages queue across our API and direct buses, then let's not add more on top until + * the queue gets more empty. */ +#define MANAGER_BUS_BUSY_THRESHOLD 1024LU + +/* How many units and jobs to process of the bus queue before returning to the event loop. */ +#define MANAGER_BUS_MESSAGE_BUDGET 100U + +#define DEFAULT_TASKS_MAX ((CGroupTasksMax) { 15U, 100U }) /* 15% */ + +static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata); +static int manager_dispatch_run_queue(sd_event_source *source, void *userdata); +static int manager_dispatch_sigchld(sd_event_source *source, void *userdata); +static int manager_dispatch_timezone_change(sd_event_source *source, const struct inotify_event *event, void *userdata); +static int manager_run_environment_generators(Manager *m); +static int manager_run_generators(Manager *m); +static void manager_vacuum(Manager *m); + +static usec_t manager_watch_jobs_next_time(Manager *m) { + usec_t timeout; + + if (MANAGER_IS_USER(m)) + /* Let the user manager without a timeout show status quickly, so the system manager can make + * use of it, if it wants to. */ + timeout = JOBS_IN_PROGRESS_WAIT_USEC * 2 / 3; + else if (show_status_on(m->show_status)) + /* When status is on, just use the usual timeout. */ + timeout = JOBS_IN_PROGRESS_WAIT_USEC; + else + timeout = JOBS_IN_PROGRESS_QUIET_WAIT_USEC; + + return usec_add(now(CLOCK_MONOTONIC), timeout); +} + +static bool manager_is_confirm_spawn_disabled(Manager *m) { + assert(m); + + if (!m->confirm_spawn) + return true; + + return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0; +} + +static void manager_watch_jobs_in_progress(Manager *m) { + usec_t next; + int r; + + assert(m); + + /* We do not want to show the cylon animation if the user + * needs to confirm service executions otherwise confirmation + * messages will be screwed by the cylon animation. */ + if (!manager_is_confirm_spawn_disabled(m)) + return; + + if (m->jobs_in_progress_event_source) + return; + + next = manager_watch_jobs_next_time(m); + r = sd_event_add_time( + m->event, + &m->jobs_in_progress_event_source, + CLOCK_MONOTONIC, + next, 0, + manager_dispatch_jobs_in_progress, m); + if (r < 0) + return; + + (void) sd_event_source_set_description(m->jobs_in_progress_event_source, "manager-jobs-in-progress"); +} + +static void manager_flip_auto_status(Manager *m, bool enable, const char *reason) { + assert(m); + + if (enable) { + if (m->show_status == SHOW_STATUS_AUTO) + manager_set_show_status(m, SHOW_STATUS_TEMPORARY, reason); + } else { + if (m->show_status == SHOW_STATUS_TEMPORARY) + manager_set_show_status(m, SHOW_STATUS_AUTO, reason); + } +} + +static void manager_print_jobs_in_progress(Manager *m) { + Job *j; + unsigned counter = 0, print_nr; + char cylon[6 + CYLON_BUFFER_EXTRA + 1]; + unsigned cylon_pos; + uint64_t timeout = 0; + + assert(m); + assert(m->n_running_jobs > 0); + + manager_flip_auto_status(m, true, "delay"); + + print_nr = (m->jobs_in_progress_iteration / JOBS_IN_PROGRESS_PERIOD_DIVISOR) % m->n_running_jobs; + + HASHMAP_FOREACH(j, m->jobs) + if (j->state == JOB_RUNNING && counter++ == print_nr) + break; + + /* m->n_running_jobs must be consistent with the contents of m->jobs, + * so the above loop must have succeeded in finding j. */ + assert(counter == print_nr + 1); + assert(j); + + cylon_pos = m->jobs_in_progress_iteration % 14; + if (cylon_pos >= 8) + cylon_pos = 14 - cylon_pos; + draw_cylon(cylon, sizeof(cylon), 6, cylon_pos); + + m->jobs_in_progress_iteration++; + + char job_of_n[STRLEN("( of ) ") + DECIMAL_STR_MAX(unsigned)*2] = ""; + if (m->n_running_jobs > 1) + xsprintf(job_of_n, "(%u of %u) ", counter, m->n_running_jobs); + + (void) job_get_timeout(j, &timeout); + + /* We want to use enough information for the user to identify previous lines talking about the same + * unit, but keep the message as short as possible. So if 'Starting foo.service' or 'Starting + * foo.service - Description' were used, 'foo.service' is enough here. On the other hand, if we used + * 'Starting Description' before, then we shall also use 'Description' here. So we pass NULL as the + * second argument to unit_status_string(). */ + const char *ident = unit_status_string(j->unit, NULL); + + const char *time = FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - j->begin_usec, 1*USEC_PER_SEC); + const char *limit = timeout > 0 ? FORMAT_TIMESPAN(timeout - j->begin_usec, 1*USEC_PER_SEC) : "no limit"; + + if (m->status_unit_format == STATUS_UNIT_FORMAT_DESCRIPTION) + /* When using 'Description', we effectively don't have enough space to show the nested status + * without ellipsization, so let's not even try. */ + manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon, + "%sA %s job is running for %s (%s / %s)", + job_of_n, + job_type_to_string(j->type), + ident, + time, limit); + else { + const char *status_text = unit_status_text(j->unit); + + manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon, + "%sJob %s/%s running (%s / %s)%s%s", + job_of_n, + ident, + job_type_to_string(j->type), + time, limit, + status_text ? ": " : "", + strempty(status_text)); + } + + sd_notifyf(false, + "STATUS=%sUser job %s/%s running (%s / %s)...", + job_of_n, + ident, + job_type_to_string(j->type), + time, limit); + m->status_ready = false; +} + +static int have_ask_password(void) { + _cleanup_closedir_ DIR *dir = NULL; + + dir = opendir("/run/systemd/ask-password"); + if (!dir) { + if (errno == ENOENT) + return false; + else + return -errno; + } + + FOREACH_DIRENT_ALL(de, dir, return -errno) + if (startswith(de->d_name, "ask.")) + return true; + return false; +} + +static int manager_dispatch_ask_password_fd(sd_event_source *source, + int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + (void) flush_fd(fd); + + m->have_ask_password = have_ask_password(); + if (m->have_ask_password < 0) + /* Log error but continue. Negative have_ask_password + * is treated as unknown status. */ + log_error_errno(m->have_ask_password, "Failed to list /run/systemd/ask-password: %m"); + + return 0; +} + +static void manager_close_ask_password(Manager *m) { + assert(m); + + m->ask_password_event_source = sd_event_source_disable_unref(m->ask_password_event_source); + m->ask_password_inotify_fd = safe_close(m->ask_password_inotify_fd); + m->have_ask_password = -EINVAL; +} + +static int manager_check_ask_password(Manager *m) { + int r; + + assert(m); + + if (!m->ask_password_event_source) { + assert(m->ask_password_inotify_fd < 0); + + (void) mkdir_p_label("/run/systemd/ask-password", 0755); + + m->ask_password_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (m->ask_password_inotify_fd < 0) + return log_error_errno(errno, "Failed to create inotify object: %m"); + + r = inotify_add_watch_and_warn(m->ask_password_inotify_fd, + "/run/systemd/ask-password", + IN_CREATE|IN_DELETE|IN_MOVE); + if (r < 0) { + manager_close_ask_password(m); + return r; + } + + r = sd_event_add_io(m->event, &m->ask_password_event_source, + m->ask_password_inotify_fd, EPOLLIN, + manager_dispatch_ask_password_fd, m); + if (r < 0) { + log_error_errno(r, "Failed to add event source for /run/systemd/ask-password: %m"); + manager_close_ask_password(m); + return r; + } + + (void) sd_event_source_set_description(m->ask_password_event_source, "manager-ask-password"); + + /* Queries might have been added meanwhile... */ + manager_dispatch_ask_password_fd(m->ask_password_event_source, + m->ask_password_inotify_fd, EPOLLIN, m); + } + + return m->have_ask_password; +} + +static int manager_watch_idle_pipe(Manager *m) { + int r; + + assert(m); + + if (m->idle_pipe_event_source) + return 0; + + if (m->idle_pipe[2] < 0) + return 0; + + r = sd_event_add_io(m->event, &m->idle_pipe_event_source, m->idle_pipe[2], EPOLLIN, manager_dispatch_idle_pipe_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to watch idle pipe: %m"); + + (void) sd_event_source_set_description(m->idle_pipe_event_source, "manager-idle-pipe"); + + return 0; +} + +static void manager_close_idle_pipe(Manager *m) { + assert(m); + + m->idle_pipe_event_source = sd_event_source_disable_unref(m->idle_pipe_event_source); + + safe_close_pair(m->idle_pipe); + safe_close_pair(m->idle_pipe + 2); +} + +static int manager_setup_time_change(Manager *m) { + int r; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + m->time_change_event_source = sd_event_source_disable_unref(m->time_change_event_source); + + r = event_add_time_change(m->event, &m->time_change_event_source, manager_dispatch_time_change_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to create time change event source: %m"); + + /* Schedule this slightly earlier than the .timer event sources */ + r = sd_event_source_set_priority(m->time_change_event_source, SD_EVENT_PRIORITY_NORMAL-1); + if (r < 0) + return log_error_errno(r, "Failed to set priority of time change event sources: %m"); + + log_debug("Set up TFD_TIMER_CANCEL_ON_SET timerfd."); + + return 0; +} + +static int manager_read_timezone_stat(Manager *m) { + struct stat st; + bool changed; + + assert(m); + + /* Read the current stat() data of /etc/localtime so that we detect changes */ + if (lstat("/etc/localtime", &st) < 0) { + log_debug_errno(errno, "Failed to stat /etc/localtime, ignoring: %m"); + changed = m->etc_localtime_accessible; + m->etc_localtime_accessible = false; + } else { + usec_t k; + + k = timespec_load(&st.st_mtim); + changed = !m->etc_localtime_accessible || k != m->etc_localtime_mtime; + + m->etc_localtime_mtime = k; + m->etc_localtime_accessible = true; + } + + return changed; +} + +static int manager_setup_timezone_change(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *new_event = NULL; + int r; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + /* We watch /etc/localtime for three events: change of the link count (which might mean removal from /etc even + * though another link might be kept), renames, and file close operations after writing. Note we don't bother + * with IN_DELETE_SELF, as that would just report when the inode is removed entirely, i.e. after the link count + * went to zero and all fds to it are closed. + * + * Note that we never follow symlinks here. This is a simplification, but should cover almost all cases + * correctly. + * + * Note that we create the new event source first here, before releasing the old one. This should optimize + * behaviour as this way sd-event can reuse the old watch in case the inode didn't change. */ + + r = sd_event_add_inotify(m->event, &new_event, "/etc/localtime", + IN_ATTRIB|IN_MOVE_SELF|IN_CLOSE_WRITE|IN_DONT_FOLLOW, manager_dispatch_timezone_change, m); + if (r == -ENOENT) { + /* If the file doesn't exist yet, subscribe to /etc instead, and wait until it is created either by + * O_CREATE or by rename() */ + + log_debug_errno(r, "/etc/localtime doesn't exist yet, watching /etc instead."); + r = sd_event_add_inotify(m->event, &new_event, "/etc", + IN_CREATE|IN_MOVED_TO|IN_ONLYDIR, manager_dispatch_timezone_change, m); + } + if (r < 0) + return log_error_errno(r, "Failed to create timezone change event source: %m"); + + /* Schedule this slightly earlier than the .timer event sources */ + r = sd_event_source_set_priority(new_event, SD_EVENT_PRIORITY_NORMAL-1); + if (r < 0) + return log_error_errno(r, "Failed to set priority of timezone change event sources: %m"); + + sd_event_source_unref(m->timezone_change_event_source); + m->timezone_change_event_source = TAKE_PTR(new_event); + + return 0; +} + +static int enable_special_signals(Manager *m) { + _cleanup_close_ int fd = -EBADF; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + /* Enable that we get SIGINT on control-alt-del. In containers + * this will fail with EPERM (older) or EINVAL (newer), so + * ignore that. */ + if (reboot(RB_DISABLE_CAD) < 0 && !IN_SET(errno, EPERM, EINVAL)) + log_warning_errno(errno, "Failed to enable ctrl-alt-del handling: %m"); + + fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC); + if (fd < 0) { + /* Support systems without virtual console */ + if (fd != -ENOENT) + log_warning_errno(errno, "Failed to open /dev/tty0: %m"); + } else { + /* Enable that we get SIGWINCH on kbrequest */ + if (ioctl(fd, KDSIGACCEPT, SIGWINCH) < 0) + log_warning_errno(errno, "Failed to enable kbrequest handling: %m"); + } + + return 0; +} + +#define RTSIG_IF_AVAILABLE(signum) (signum <= SIGRTMAX ? signum : -1) + +static int manager_setup_signals(Manager *m) { + struct sigaction sa = { + .sa_handler = SIG_DFL, + .sa_flags = SA_NOCLDSTOP|SA_RESTART, + }; + sigset_t mask; + int r; + + assert(m); + + assert_se(sigaction(SIGCHLD, &sa, NULL) == 0); + + /* We make liberal use of realtime signals here. On + * Linux/glibc we have 30 of them (with the exception of Linux + * on hppa, see below), between SIGRTMIN+0 ... SIGRTMIN+30 + * (aka SIGRTMAX). */ + + assert_se(sigemptyset(&mask) == 0); + sigset_add_many(&mask, + SIGCHLD, /* Child died */ + SIGTERM, /* Reexecute daemon */ + SIGHUP, /* Reload configuration */ + SIGUSR1, /* systemd: reconnect to D-Bus */ + SIGUSR2, /* systemd: dump status */ + SIGINT, /* Kernel sends us this on control-alt-del */ + SIGWINCH, /* Kernel sends us this on kbrequest (alt-arrowup) */ + SIGPWR, /* Some kernel drivers and upsd send us this on power failure */ + + SIGRTMIN+0, /* systemd: start default.target */ + SIGRTMIN+1, /* systemd: isolate rescue.target */ + SIGRTMIN+2, /* systemd: isolate emergency.target */ + SIGRTMIN+3, /* systemd: start halt.target */ + SIGRTMIN+4, /* systemd: start poweroff.target */ + SIGRTMIN+5, /* systemd: start reboot.target */ + SIGRTMIN+6, /* systemd: start kexec.target */ + SIGRTMIN+7, /* systemd: start soft-reboot.target */ + + /* ... space for more special targets ... */ + + SIGRTMIN+13, /* systemd: Immediate halt */ + SIGRTMIN+14, /* systemd: Immediate poweroff */ + SIGRTMIN+15, /* systemd: Immediate reboot */ + SIGRTMIN+16, /* systemd: Immediate kexec */ + SIGRTMIN+17, /* systemd: Immediate soft-reboot */ + SIGRTMIN+18, /* systemd: control command */ + + /* ... space ... */ + + SIGRTMIN+20, /* systemd: enable status messages */ + SIGRTMIN+21, /* systemd: disable status messages */ + SIGRTMIN+22, /* systemd: set log level to LOG_DEBUG */ + SIGRTMIN+23, /* systemd: set log level to LOG_INFO */ + SIGRTMIN+24, /* systemd: Immediate exit (--user only) */ + SIGRTMIN+25, /* systemd: reexecute manager */ + + /* Apparently Linux on hppa had fewer RT signals until v3.18, + * SIGRTMAX was SIGRTMIN+25, and then SIGRTMIN was lowered, + * see commit v3.17-7614-g1f25df2eff. + * + * We cannot unconditionally make use of those signals here, + * so let's use a runtime check. Since these commands are + * accessible by different means and only really a safety + * net, the missing functionality on hppa shouldn't matter. + */ + + RTSIG_IF_AVAILABLE(SIGRTMIN+26), /* systemd: set log target to journal-or-kmsg */ + RTSIG_IF_AVAILABLE(SIGRTMIN+27), /* systemd: set log target to console */ + RTSIG_IF_AVAILABLE(SIGRTMIN+28), /* systemd: set log target to kmsg */ + RTSIG_IF_AVAILABLE(SIGRTMIN+29), /* systemd: set log target to syslog-or-kmsg (obsolete) */ + + /* ... one free signal here SIGRTMIN+30 ... */ + -1); + assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); + + m->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC); + if (m->signal_fd < 0) + return -errno; + + r = sd_event_add_io(m->event, &m->signal_event_source, m->signal_fd, EPOLLIN, manager_dispatch_signal_fd, m); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->signal_event_source, "manager-signal"); + + /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the + * notify processing can still figure out to which process/service a message belongs, before we reap the + * process. Also, process this before handling cgroup notifications, so that we always collect child exit + * status information before detecting that there's no process in a cgroup. */ + r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(m)) + return enable_special_signals(m); + + return 0; +} + +static char** sanitize_environment(char **l) { + + /* Let's remove some environment variables that we need ourselves to communicate with our clients */ + strv_env_unset_many( + l, + "CACHE_DIRECTORY", + "CONFIGURATION_DIRECTORY", + "CREDENTIALS_DIRECTORY", + "EXIT_CODE", + "EXIT_STATUS", + "INVOCATION_ID", + "JOURNAL_STREAM", + "LISTEN_FDNAMES", + "LISTEN_FDS", + "LISTEN_PID", + "LOGS_DIRECTORY", + "LOG_NAMESPACE", + "MAINPID", + "MANAGERPID", + "MEMORY_PRESSURE_WATCH", + "MEMORY_PRESSURE_WRITE", + "MONITOR_EXIT_CODE", + "MONITOR_EXIT_STATUS", + "MONITOR_INVOCATION_ID", + "MONITOR_SERVICE_RESULT", + "MONITOR_UNIT", + "NOTIFY_SOCKET", + "PIDFILE", + "REMOTE_ADDR", + "REMOTE_PORT", + "RUNTIME_DIRECTORY", + "SERVICE_RESULT", + "STATE_DIRECTORY", + "SYSTEMD_EXEC_PID", + "TRIGGER_PATH", + "TRIGGER_TIMER_MONOTONIC_USEC", + "TRIGGER_TIMER_REALTIME_USEC", + "TRIGGER_UNIT", + "WATCHDOG_PID", + "WATCHDOG_USEC", + NULL); + + /* Let's order the environment alphabetically, just to make it pretty */ + return strv_sort(l); +} + +int manager_default_environment(Manager *m) { + int r; + + assert(m); + + m->transient_environment = strv_free(m->transient_environment); + + if (MANAGER_IS_SYSTEM(m)) { + /* The system manager always starts with a clean environment for its children. It does not + * import the kernel's or the parents' exported variables. + * + * The initial passed environment is untouched to keep /proc/self/environ valid; it is used + * for tagging the init process inside containers. */ + m->transient_environment = strv_new("PATH=" DEFAULT_PATH); + if (!m->transient_environment) + return log_oom(); + + /* Import locale variables LC_*= from configuration */ + (void) locale_setup(&m->transient_environment); + } else { + /* The user manager passes its own environment along to its children, except for $PATH. */ + m->transient_environment = strv_copy(environ); + if (!m->transient_environment) + return log_oom(); + + r = strv_env_replace_strdup(&m->transient_environment, "PATH=" DEFAULT_USER_PATH); + if (r < 0) + return log_oom(); + } + + sanitize_environment(m->transient_environment); + return 0; +} + +static int manager_setup_prefix(Manager *m) { + struct table_entry { + uint64_t type; + const char *suffix; + }; + + static const struct table_entry paths_system[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME, NULL }, + [EXEC_DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE, NULL }, + [EXEC_DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE, NULL }, + [EXEC_DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS, NULL }, + [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_SYSTEM_CONFIGURATION, NULL }, + }; + + static const struct table_entry paths_user[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME, NULL }, + [EXEC_DIRECTORY_STATE] = { SD_PATH_USER_STATE_PRIVATE, NULL }, + [EXEC_DIRECTORY_CACHE] = { SD_PATH_USER_STATE_CACHE, NULL }, + [EXEC_DIRECTORY_LOGS] = { SD_PATH_USER_STATE_PRIVATE, "log" }, + [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_USER_CONFIGURATION, NULL }, + }; + + assert(m); + + const struct table_entry *p = MANAGER_IS_SYSTEM(m) ? paths_system : paths_user; + int r; + + for (ExecDirectoryType i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) { + r = sd_path_lookup(p[i].type, p[i].suffix, &m->prefix[i]); + if (r < 0) + return log_warning_errno(r, "Failed to lookup %s path: %m", + exec_directory_type_to_string(i)); + } + + return 0; +} + +static void manager_free_unit_name_maps(Manager *m) { + m->unit_id_map = hashmap_free(m->unit_id_map); + m->unit_name_map = hashmap_free(m->unit_name_map); + m->unit_path_cache = set_free(m->unit_path_cache); + m->unit_cache_timestamp_hash = 0; +} + +static int manager_setup_run_queue(Manager *m) { + int r; + + assert(m); + assert(!m->run_queue_event_source); + + r = sd_event_add_defer(m->event, &m->run_queue_event_source, manager_dispatch_run_queue, m); + if (r < 0) + return r; + + r = sd_event_source_set_priority(m->run_queue_event_source, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(m->run_queue_event_source, SD_EVENT_OFF); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->run_queue_event_source, "manager-run-queue"); + + return 0; +} + +static int manager_setup_sigchld_event_source(Manager *m) { + int r; + + assert(m); + assert(!m->sigchld_event_source); + + r = sd_event_add_defer(m->event, &m->sigchld_event_source, manager_dispatch_sigchld, m); + if (r < 0) + return r; + + r = sd_event_source_set_priority(m->sigchld_event_source, SD_EVENT_PRIORITY_NORMAL-7); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->sigchld_event_source, "manager-sigchld"); + + return 0; +} + +int manager_setup_memory_pressure_event_source(Manager *m) { + int r; + + assert(m); + + m->memory_pressure_event_source = sd_event_source_disable_unref(m->memory_pressure_event_source); + + r = sd_event_add_memory_pressure(m->event, &m->memory_pressure_event_source, NULL, NULL); + if (r < 0) + log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN) ? LOG_DEBUG : LOG_NOTICE, r, + "Failed to establish memory pressure event source, ignoring: %m"); + else if (m->defaults.memory_pressure_threshold_usec != USEC_INFINITY) { + + /* If there's a default memory pressure threshold set, also apply it to the service manager itself */ + r = sd_event_source_set_memory_pressure_period( + m->memory_pressure_event_source, + m->defaults.memory_pressure_threshold_usec, + MEMORY_PRESSURE_DEFAULT_WINDOW_USEC); + if (r < 0) + log_warning_errno(r, "Failed to adjust memory pressure threshold, ignoring: %m"); + } + + return 0; +} + +static int manager_find_credentials_dirs(Manager *m) { + const char *e; + int r; + + assert(m); + + r = get_credentials_dir(&e); + if (r < 0) { + if (r != -ENXIO) + log_debug_errno(r, "Failed to determine credentials directory, ignoring: %m"); + } else { + m->received_credentials_directory = strdup(e); + if (!m->received_credentials_directory) + return -ENOMEM; + } + + r = get_encrypted_credentials_dir(&e); + if (r < 0) { + if (r != -ENXIO) + log_debug_errno(r, "Failed to determine encrypted credentials directory, ignoring: %m"); + } else { + m->received_encrypted_credentials_directory = strdup(e); + if (!m->received_encrypted_credentials_directory) + return -ENOMEM; + } + + return 0; +} + +void manager_set_switching_root(Manager *m, bool switching_root) { + assert(m); + + m->switching_root = MANAGER_IS_SYSTEM(m) && switching_root; +} + +double manager_get_progress(Manager *m) { + assert(m); + + if (MANAGER_IS_FINISHED(m) || m->n_installed_jobs == 0) + return 1.0; + + return 1.0 - ((double) hashmap_size(m->jobs) / (double) m->n_installed_jobs); +} + +static int compare_job_priority(const void *a, const void *b) { + const Job *x = a, *y = b; + + return unit_compare_priority(x->unit, y->unit); +} + +int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + assert(IN_SET(runtime_scope, RUNTIME_SCOPE_SYSTEM, RUNTIME_SCOPE_USER)); + assert(ret); + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .runtime_scope = runtime_scope, + .objective = _MANAGER_OBJECTIVE_INVALID, + + .status_unit_format = STATUS_UNIT_FORMAT_DEFAULT, + + .original_log_level = -1, + .original_log_target = _LOG_TARGET_INVALID, + + .watchdog_overridden[WATCHDOG_RUNTIME] = USEC_INFINITY, + .watchdog_overridden[WATCHDOG_REBOOT] = USEC_INFINITY, + .watchdog_overridden[WATCHDOG_KEXEC] = USEC_INFINITY, + .watchdog_overridden[WATCHDOG_PRETIMEOUT] = USEC_INFINITY, + + .show_status_overridden = _SHOW_STATUS_INVALID, + + .notify_fd = -EBADF, + .cgroups_agent_fd = -EBADF, + .signal_fd = -EBADF, + .user_lookup_fds = EBADF_PAIR, + .private_listen_fd = -EBADF, + .dev_autofs_fd = -EBADF, + .cgroup_inotify_fd = -EBADF, + .pin_cgroupfs_fd = -EBADF, + .ask_password_inotify_fd = -EBADF, + .idle_pipe = { -EBADF, -EBADF, -EBADF, -EBADF}, + + /* start as id #1, so that we can leave #0 around as "null-like" value */ + .current_job_id = 1, + + .have_ask_password = -EINVAL, /* we don't know */ + .first_boot = -1, + .test_run_flags = test_run_flags, + + .dump_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_MINUTE, .burst = 10 }, + + .executor_fd = -EBADF, + }; + + unit_defaults_init(&m->defaults, runtime_scope); + +#if ENABLE_EFI + if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) + boot_timestamps(m->timestamps + MANAGER_TIMESTAMP_USERSPACE, + m->timestamps + MANAGER_TIMESTAMP_FIRMWARE, + m->timestamps + MANAGER_TIMESTAMP_LOADER); +#endif + + /* Prepare log fields we can use for structured logging */ + if (MANAGER_IS_SYSTEM(m)) { + m->unit_log_field = "UNIT="; + m->unit_log_format_string = "UNIT=%s"; + + m->invocation_log_field = "INVOCATION_ID="; + m->invocation_log_format_string = "INVOCATION_ID=%s"; + } else { + m->unit_log_field = "USER_UNIT="; + m->unit_log_format_string = "USER_UNIT=%s"; + + m->invocation_log_field = "USER_INVOCATION_ID="; + m->invocation_log_format_string = "USER_INVOCATION_ID=%s"; + } + + /* Reboot immediately if the user hits C-A-D more often than 7x per 2s */ + m->ctrl_alt_del_ratelimit = (const RateLimit) { .interval = 2 * USEC_PER_SEC, .burst = 7 }; + + r = manager_default_environment(m); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->units, &string_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->cgroup_unit, &path_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->watch_bus, &string_hash_ops); + if (r < 0) + return r; + + r = prioq_ensure_allocated(&m->run_queue, compare_job_priority); + if (r < 0) + return r; + + r = manager_setup_prefix(m); + if (r < 0) + return r; + + r = manager_find_credentials_dirs(m); + if (r < 0) + return r; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + r = manager_setup_run_queue(m); + if (r < 0) + return r; + + if (FLAGS_SET(test_run_flags, MANAGER_TEST_RUN_MINIMAL)) { + m->cgroup_root = strdup(""); + if (!m->cgroup_root) + return -ENOMEM; + } else { + r = manager_setup_signals(m); + if (r < 0) + return r; + + r = manager_setup_cgroup(m); + if (r < 0) + return r; + + r = manager_setup_time_change(m); + if (r < 0) + return r; + + r = manager_read_timezone_stat(m); + if (r < 0) + return r; + + (void) manager_setup_timezone_change(m); + + r = manager_setup_sigchld_event_source(m); + if (r < 0) + return r; + + r = manager_setup_memory_pressure_event_source(m); + if (r < 0) + return r; + +#if HAVE_LIBBPF + if (MANAGER_IS_SYSTEM(m) && lsm_bpf_supported(/* initialize = */ true)) { + r = lsm_bpf_setup(m); + if (r < 0) + log_warning_errno(r, "Failed to setup LSM BPF, ignoring: %m"); + } +#endif + } + + if (test_run_flags == 0) { + if (MANAGER_IS_SYSTEM(m)) + r = mkdir_label("/run/systemd/units", 0755); + else { + _cleanup_free_ char *units_path = NULL; + r = xdg_user_runtime_dir(&units_path, "/systemd/units"); + if (r < 0) + return r; + r = mkdir_p_label(units_path, 0755); + } + + if (r < 0 && r != -EEXIST) + return r; + + m->executor_fd = open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH); + if (m->executor_fd < 0) + return log_emergency_errno(errno, + "Failed to open executor binary '%s': %m", + SYSTEMD_EXECUTOR_BINARY_PATH); + } else if (!FLAGS_SET(test_run_flags, MANAGER_TEST_DONT_OPEN_EXECUTOR)) { + _cleanup_free_ char *self_exe = NULL, *executor_path = NULL; + _cleanup_close_ int self_dir_fd = -EBADF; + int level = LOG_DEBUG; + + /* Prefer sd-executor from the same directory as the test, e.g.: when running unit tests from the + * build directory. Fallback to working directory and then the installation path. */ + r = readlink_and_make_absolute("/proc/self/exe", &self_exe); + if (r < 0) + return r; + + self_dir_fd = open_parent(self_exe, O_CLOEXEC|O_PATH|O_DIRECTORY, 0); + if (self_dir_fd < 0) + return self_dir_fd; + + m->executor_fd = RET_NERRNO(openat(self_dir_fd, "systemd-executor", O_CLOEXEC|O_PATH)); + if (m->executor_fd == -ENOENT) + m->executor_fd = RET_NERRNO(openat(AT_FDCWD, "systemd-executor", O_CLOEXEC|O_PATH)); + if (m->executor_fd == -ENOENT) { + m->executor_fd = RET_NERRNO(open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH)); + level = LOG_WARNING; /* Tests should normally use local builds */ + } + if (m->executor_fd < 0) + return m->executor_fd; + + r = fd_get_path(m->executor_fd, &executor_path); + if (r < 0) + return r; + + log_full(level, "Using systemd-executor binary from '%s'.", executor_path); + } + + /* Note that we do not set up the notify fd here. We do that after deserialization, + * since they might have gotten serialized across the reexec. */ + + *ret = TAKE_PTR(m); + + return 0; +} + +static int manager_setup_notify(Manager *m) { + int r; + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + if (m->notify_fd < 0) { + _cleanup_close_ int fd = -EBADF; + union sockaddr_union sa; + socklen_t sa_len; + + /* First free all secondary fields */ + m->notify_socket = mfree(m->notify_socket); + m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate notification socket: %m"); + + fd_increase_rxbuf(fd, NOTIFY_RCVBUF_SIZE); + + m->notify_socket = path_join(m->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/notify"); + if (!m->notify_socket) + return log_oom(); + + r = sockaddr_un_set_path(&sa.un, m->notify_socket); + if (r < 0) + return log_error_errno(r, "Notify socket '%s' not valid for AF_UNIX socket address, refusing.", + m->notify_socket); + sa_len = r; + + (void) mkdir_parents_label(m->notify_socket, 0755); + (void) sockaddr_un_unlink(&sa.un); + + r = mac_selinux_bind(fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(r, "bind(%s) failed: %m", m->notify_socket); + + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + m->notify_fd = TAKE_FD(fd); + + log_debug("Using notification socket %s", m->notify_socket); + } + + if (!m->notify_event_source) { + r = sd_event_add_io(m->event, &m->notify_event_source, m->notify_fd, EPOLLIN, manager_dispatch_notify_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate notify event source: %m"); + + /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which + * service an exit message belongs. */ + r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-8); + if (r < 0) + return log_error_errno(r, "Failed to set priority of notify event source: %m"); + + (void) sd_event_source_set_description(m->notify_event_source, "manager-notify"); + } + + return 0; +} + +static int manager_setup_cgroups_agent(Manager *m) { + + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/cgroups-agent", + }; + int r; + + /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering + * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and + * each instance of it needs a new D-Bus connection. Since D-Bus connections are SOCK_STREAM/AF_UNIX, on + * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number + * of D-Bus connections may be queued until the kernel will start dropping further incoming connections, + * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX + * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and + * we thus won't lose messages. + * + * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen + * to it. The system instance hence listens on this special socket, but the user instances listen on the system + * bus for these messages. */ + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Failed to determine whether unified cgroups hierarchy is used: %m"); + if (r > 0) /* We don't need this anymore on the unified hierarchy */ + return 0; + + if (m->cgroups_agent_fd < 0) { + _cleanup_close_ int fd = -EBADF; + + /* First free all secondary fields */ + m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m"); + + fd_increase_rxbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE); + + (void) sockaddr_un_unlink(&sa.un); + + /* Only allow root to connect to this socket */ + WITH_UMASK(0077) + r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + m->cgroups_agent_fd = TAKE_FD(fd); + } + + if (!m->cgroups_agent_event_source) { + r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate cgroups agent event source: %m"); + + /* Process cgroups notifications early. Note that when the agent notification is received + * we'll just enqueue the unit in the cgroup empty queue, hence pick a high priority than + * that. Also see handling of cgroup inotify for the unified cgroup stuff. */ + r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-9); + if (r < 0) + return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m"); + + (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent"); + } + + return 0; +} + +static int manager_setup_user_lookup_fd(Manager *m) { + int r; + + assert(m); + + /* Set up the socket pair used for passing UID/GID resolution results from forked off processes to PID + * 1. Background: we can't do name lookups (NSS) from PID 1, since it might involve IPC and thus activation, + * and we might hence deadlock on ourselves. Hence we do all user/group lookups asynchronously from the forked + * off processes right before executing the binaries to start. In order to be able to clean up any IPC objects + * created by a unit (see RemoveIPC=) we need to know in PID 1 the used UID/GID of the executed processes, + * hence we establish this communication channel so that forked off processes can pass their UID/GID + * information back to PID 1. The forked off processes send their resolved UID/GID to PID 1 in a simple + * datagram, along with their unit name, so that we can share one communication socket pair among all units for + * this purpose. + * + * You might wonder why we need a communication channel for this that is independent of the usual notification + * socket scheme (i.e. $NOTIFY_SOCKET). The primary difference is about trust: data sent via the $NOTIFY_SOCKET + * channel is only accepted if it originates from the right unit and if reception was enabled for it. The user + * lookup socket OTOH is only accessible by PID 1 and its children until they exec(), and always available. + * + * Note that this function is called under two circumstances: when we first initialize (in which case we + * allocate both the socket pair and the event source to listen on it), and when we deserialize after a reload + * (in which case the socket pair already exists but we still need to allocate the event source for it). */ + + if (m->user_lookup_fds[0] < 0) { + + /* Free all secondary fields */ + safe_close_pair(m->user_lookup_fds); + m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source); + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->user_lookup_fds) < 0) + return log_error_errno(errno, "Failed to allocate user lookup socket: %m"); + + (void) fd_increase_rxbuf(m->user_lookup_fds[0], NOTIFY_RCVBUF_SIZE); + } + + if (!m->user_lookup_event_source) { + r = sd_event_add_io(m->event, &m->user_lookup_event_source, m->user_lookup_fds[0], EPOLLIN, manager_dispatch_user_lookup_fd, m); + if (r < 0) + return log_error_errno(errno, "Failed to allocate user lookup event source: %m"); + + /* Process even earlier than the notify event source, so that we always know first about valid UID/GID + * resolutions */ + r = sd_event_source_set_priority(m->user_lookup_event_source, SD_EVENT_PRIORITY_NORMAL-11); + if (r < 0) + return log_error_errno(errno, "Failed to set priority of user lookup event source: %m"); + + (void) sd_event_source_set_description(m->user_lookup_event_source, "user-lookup"); + } + + return 0; +} + +static unsigned manager_dispatch_cleanup_queue(Manager *m) { + Unit *u; + unsigned n = 0; + + assert(m); + + while ((u = m->cleanup_queue)) { + assert(u->in_cleanup_queue); + + unit_free(u); + n++; + } + + return n; +} + +static unsigned manager_dispatch_release_resources_queue(Manager *m) { + unsigned n = 0; + Unit *u; + + assert(m); + + while ((u = LIST_POP(release_resources_queue, m->release_resources_queue))) { + assert(u->in_release_resources_queue); + u->in_release_resources_queue = false; + + n++; + + unit_release_resources(u); + } + + return n; +} + +enum { + GC_OFFSET_IN_PATH, /* This one is on the path we were traveling */ + GC_OFFSET_UNSURE, /* No clue */ + GC_OFFSET_GOOD, /* We still need this unit */ + GC_OFFSET_BAD, /* We don't need this unit anymore */ + _GC_OFFSET_MAX +}; + +static void unit_gc_mark_good(Unit *u, unsigned gc_marker) { + Unit *other; + + u->gc_marker = gc_marker + GC_OFFSET_GOOD; + + /* Recursively mark referenced units as GOOD as well */ + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_REFERENCES) + if (other->gc_marker == gc_marker + GC_OFFSET_UNSURE) + unit_gc_mark_good(other, gc_marker); +} + +static void unit_gc_sweep(Unit *u, unsigned gc_marker) { + Unit *other; + bool is_bad; + + assert(u); + + if (IN_SET(u->gc_marker - gc_marker, + GC_OFFSET_GOOD, GC_OFFSET_BAD, GC_OFFSET_UNSURE, GC_OFFSET_IN_PATH)) + return; + + if (u->in_cleanup_queue) + goto bad; + + if (!unit_may_gc(u)) + goto good; + + u->gc_marker = gc_marker + GC_OFFSET_IN_PATH; + + is_bad = true; + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_REFERENCED_BY) { + unit_gc_sweep(other, gc_marker); + + if (other->gc_marker == gc_marker + GC_OFFSET_GOOD) + goto good; + + if (other->gc_marker != gc_marker + GC_OFFSET_BAD) + is_bad = false; + } + + LIST_FOREACH(refs_by_target, ref, u->refs_by_target) { + unit_gc_sweep(ref->source, gc_marker); + + if (ref->source->gc_marker == gc_marker + GC_OFFSET_GOOD) + goto good; + + if (ref->source->gc_marker != gc_marker + GC_OFFSET_BAD) + is_bad = false; + } + + if (is_bad) + goto bad; + + /* We were unable to find anything out about this entry, so + * let's investigate it later */ + u->gc_marker = gc_marker + GC_OFFSET_UNSURE; + unit_add_to_gc_queue(u); + return; + +bad: + /* We definitely know that this one is not useful anymore, so + * let's mark it for deletion */ + u->gc_marker = gc_marker + GC_OFFSET_BAD; + unit_add_to_cleanup_queue(u); + return; + +good: + unit_gc_mark_good(u, gc_marker); +} + +static unsigned manager_dispatch_gc_unit_queue(Manager *m) { + unsigned n = 0, gc_marker; + Unit *u; + + assert(m); + + /* log_debug("Running GC..."); */ + + m->gc_marker += _GC_OFFSET_MAX; + if (m->gc_marker + _GC_OFFSET_MAX <= _GC_OFFSET_MAX) + m->gc_marker = 1; + + gc_marker = m->gc_marker; + + while ((u = LIST_POP(gc_queue, m->gc_unit_queue))) { + assert(u->in_gc_queue); + + unit_gc_sweep(u, gc_marker); + + u->in_gc_queue = false; + + n++; + + if (IN_SET(u->gc_marker - gc_marker, + GC_OFFSET_BAD, GC_OFFSET_UNSURE)) { + if (u->id) + log_unit_debug(u, "Collecting."); + u->gc_marker = gc_marker + GC_OFFSET_BAD; + unit_add_to_cleanup_queue(u); + } + } + + return n; +} + +static unsigned manager_dispatch_gc_job_queue(Manager *m) { + unsigned n = 0; + Job *j; + + assert(m); + + while ((j = LIST_POP(gc_queue, m->gc_job_queue))) { + assert(j->in_gc_queue); + j->in_gc_queue = false; + + n++; + + if (!job_may_gc(j)) + continue; + + log_unit_debug(j->unit, "Collecting job."); + (void) job_finish_and_invalidate(j, JOB_COLLECTED, false, false); + } + + return n; +} + +static int manager_ratelimit_requeue(sd_event_source *s, uint64_t usec, void *userdata) { + Unit *u = userdata; + + assert(u); + assert(s == u->auto_start_stop_event_source); + + u->auto_start_stop_event_source = sd_event_source_unref(u->auto_start_stop_event_source); + + /* Re-queue to all queues, if the rate limit hit we might have been throttled on any of them. */ + unit_submit_to_stop_when_unneeded_queue(u); + unit_submit_to_start_when_upheld_queue(u); + unit_submit_to_stop_when_bound_queue(u); + + return 0; +} + +static int manager_ratelimit_check_and_queue(Unit *u) { + int r; + + assert(u); + + if (ratelimit_below(&u->auto_start_stop_ratelimit)) + return 1; + + /* Already queued, no need to requeue */ + if (u->auto_start_stop_event_source) + return 0; + + r = sd_event_add_time( + u->manager->event, + &u->auto_start_stop_event_source, + CLOCK_MONOTONIC, + ratelimit_end(&u->auto_start_stop_ratelimit), + 0, + manager_ratelimit_requeue, + u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to queue timer on event loop: %m"); + + return 0; +} + +static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) { + unsigned n = 0; + Unit *u; + int r; + + assert(m); + + while ((u = LIST_POP(stop_when_unneeded_queue, m->stop_when_unneeded_queue))) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + assert(u->in_stop_when_unneeded_queue); + u->in_stop_when_unneeded_queue = false; + + n++; + + if (!unit_is_unneeded(u)) + continue; + + log_unit_debug(u, "Unit is not needed anymore."); + + /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the + * service being unnecessary after a while. */ + + r = manager_ratelimit_check_and_queue(u); + if (r <= 0) { + log_unit_warning(u, + "Unit not needed anymore, but not stopping since we tried this too often recently.%s", + r == 0 ? " Will retry later." : ""); + continue; + } + + /* Ok, nobody needs us anymore. Sniff. Then let's commit suicide */ + r = manager_add_job(u->manager, JOB_STOP, u, JOB_FAIL, NULL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r)); + } + + return n; +} + +static unsigned manager_dispatch_start_when_upheld_queue(Manager *m) { + unsigned n = 0; + Unit *u; + int r; + + assert(m); + + while ((u = LIST_POP(start_when_upheld_queue, m->start_when_upheld_queue))) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *culprit = NULL; + + assert(u->in_start_when_upheld_queue); + u->in_start_when_upheld_queue = false; + + n++; + + if (!unit_is_upheld_by_active(u, &culprit)) + continue; + + log_unit_debug(u, "Unit is started because upheld by active unit %s.", culprit->id); + + /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the + * service being unnecessary after a while. */ + + r = manager_ratelimit_check_and_queue(u); + if (r <= 0) { + log_unit_warning(u, + "Unit needs to be started because active unit %s upholds it, but not starting since we tried this too often recently.%s", + culprit->id, + r == 0 ? " Will retry later." : ""); + continue; + } + + r = manager_add_job(u->manager, JOB_START, u, JOB_FAIL, NULL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue start job, ignoring: %s", bus_error_message(&error, r)); + } + + return n; +} + +static unsigned manager_dispatch_stop_when_bound_queue(Manager *m) { + unsigned n = 0; + Unit *u; + int r; + + assert(m); + + while ((u = LIST_POP(stop_when_bound_queue, m->stop_when_bound_queue))) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *culprit = NULL; + + assert(u->in_stop_when_bound_queue); + u->in_stop_when_bound_queue = false; + + n++; + + if (!unit_is_bound_by_inactive(u, &culprit)) + continue; + + log_unit_debug(u, "Unit is stopped because bound to inactive unit %s.", culprit->id); + + /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the + * service being unnecessary after a while. */ + + r = manager_ratelimit_check_and_queue(u); + if (r <= 0) { + log_unit_warning(u, + "Unit needs to be stopped because it is bound to inactive unit %s it, but not stopping since we tried this too often recently.%s", + culprit->id, + r == 0 ? " Will retry later." : ""); + continue; + } + + r = manager_add_job(u->manager, JOB_STOP, u, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r)); + } + + return n; +} + +static void manager_clear_jobs_and_units(Manager *m) { + Unit *u; + + assert(m); + + while ((u = hashmap_first(m->units))) + unit_free(u); + + manager_dispatch_cleanup_queue(m); + + assert(!m->load_queue); + assert(prioq_isempty(m->run_queue)); + assert(!m->dbus_unit_queue); + assert(!m->dbus_job_queue); + assert(!m->cleanup_queue); + assert(!m->gc_unit_queue); + assert(!m->gc_job_queue); + assert(!m->cgroup_realize_queue); + assert(!m->cgroup_empty_queue); + assert(!m->cgroup_oom_queue); + assert(!m->target_deps_queue); + assert(!m->stop_when_unneeded_queue); + assert(!m->start_when_upheld_queue); + assert(!m->stop_when_bound_queue); + assert(!m->release_resources_queue); + + assert(hashmap_isempty(m->jobs)); + assert(hashmap_isempty(m->units)); + + m->n_on_console = 0; + m->n_running_jobs = 0; + m->n_installed_jobs = 0; + m->n_failed_jobs = 0; +} + +Manager* manager_free(Manager *m) { + if (!m) + return NULL; + + manager_clear_jobs_and_units(m); + + for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) + if (unit_vtable[c]->shutdown) + unit_vtable[c]->shutdown(m); + + /* Keep the cgroup hierarchy in place except when we know we are going down for good */ + manager_shutdown_cgroup(m, /* delete= */ IN_SET(m->objective, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC)); + + lookup_paths_flush_generator(&m->lookup_paths); + + bus_done(m); + manager_varlink_done(m); + + exec_shared_runtime_vacuum(m); + hashmap_free(m->exec_shared_runtime_by_id); + + dynamic_user_vacuum(m, false); + hashmap_free(m->dynamic_users); + + hashmap_free(m->units); + hashmap_free(m->units_by_invocation_id); + hashmap_free(m->jobs); + hashmap_free(m->watch_pids); + hashmap_free(m->watch_pids_more); + hashmap_free(m->watch_bus); + + prioq_free(m->run_queue); + + set_free(m->startup_units); + set_free(m->failed_units); + + sd_event_source_unref(m->signal_event_source); + sd_event_source_unref(m->sigchld_event_source); + sd_event_source_unref(m->notify_event_source); + sd_event_source_unref(m->cgroups_agent_event_source); + sd_event_source_unref(m->time_change_event_source); + sd_event_source_unref(m->timezone_change_event_source); + sd_event_source_unref(m->jobs_in_progress_event_source); + sd_event_source_unref(m->run_queue_event_source); + sd_event_source_unref(m->user_lookup_event_source); + sd_event_source_unref(m->memory_pressure_event_source); + + safe_close(m->signal_fd); + safe_close(m->notify_fd); + safe_close(m->cgroups_agent_fd); + safe_close_pair(m->user_lookup_fds); + + manager_close_ask_password(m); + + manager_close_idle_pipe(m); + + sd_event_unref(m->event); + + free(m->notify_socket); + + lookup_paths_free(&m->lookup_paths); + strv_free(m->transient_environment); + strv_free(m->client_environment); + + hashmap_free(m->cgroup_unit); + manager_free_unit_name_maps(m); + + free(m->switch_root); + free(m->switch_root_init); + + unit_defaults_done(&m->defaults); + + assert(hashmap_isempty(m->units_requiring_mounts_for)); + hashmap_free(m->units_requiring_mounts_for); + + hashmap_free(m->uid_refs); + hashmap_free(m->gid_refs); + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) + m->prefix[dt] = mfree(m->prefix[dt]); + free(m->received_credentials_directory); + free(m->received_encrypted_credentials_directory); + + free(m->watchdog_pretimeout_governor); + free(m->watchdog_pretimeout_governor_overridden); + + m->fw_ctx = fw_ctx_free(m->fw_ctx); + +#if BPF_FRAMEWORK + lsm_bpf_destroy(m->restrict_fs); +#endif + + safe_close(m->executor_fd); + + return mfree(m); +} + +static void manager_enumerate_perpetual(Manager *m) { + assert(m); + + if (FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL)) + return; + + /* Let's ask every type to load all units from disk/kernel that it might know */ + for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) { + if (!unit_type_supported(c)) { + log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(c)); + continue; + } + + if (unit_vtable[c]->enumerate_perpetual) + unit_vtable[c]->enumerate_perpetual(m); + } +} + +static void manager_enumerate(Manager *m) { + assert(m); + + if (FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL)) + return; + + /* Let's ask every type to load all units from disk/kernel that it might know */ + for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) { + if (!unit_type_supported(c)) { + log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(c)); + continue; + } + + if (unit_vtable[c]->enumerate) + unit_vtable[c]->enumerate(m); + } + + manager_dispatch_load_queue(m); +} + +static void manager_coldplug(Manager *m) { + Unit *u; + char *k; + int r; + + assert(m); + + log_debug("Invoking unit coldplug() handlers%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + /* Let's place the units back into their deserialized state */ + HASHMAP_FOREACH_KEY(u, k, m->units) { + + /* ignore aliases */ + if (u->id != k) + continue; + + r = unit_coldplug(u); + if (r < 0) + log_warning_errno(r, "We couldn't coldplug %s, proceeding anyway: %m", u->id); + } +} + +static void manager_catchup(Manager *m) { + Unit *u; + char *k; + + assert(m); + + log_debug("Invoking unit catchup() handlers%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + /* Let's catch up on any state changes that happened while we were reloading/reexecing */ + HASHMAP_FOREACH_KEY(u, k, m->units) { + + /* ignore aliases */ + if (u->id != k) + continue; + + unit_catchup(u); + } +} + +static void manager_distribute_fds(Manager *m, FDSet *fds) { + Unit *u; + + assert(m); + + HASHMAP_FOREACH(u, m->units) { + + if (fdset_size(fds) <= 0) + break; + + if (!UNIT_VTABLE(u)->distribute_fds) + continue; + + UNIT_VTABLE(u)->distribute_fds(u, fds); + } +} + +static bool manager_dbus_is_running(Manager *m, bool deserialized) { + Unit *u; + + assert(m); + + /* This checks whether the dbus instance we are supposed to expose our APIs on is up. We check both the socket + * and the service unit. If the 'deserialized' parameter is true we'll check the deserialized state of the unit + * rather than the current one. */ + + if (MANAGER_IS_TEST_RUN(m)) + return false; + + u = manager_get_unit(m, SPECIAL_DBUS_SOCKET); + if (!u) + return false; + if ((deserialized ? SOCKET(u)->deserialized_state : SOCKET(u)->state) != SOCKET_RUNNING) + return false; + + u = manager_get_unit(m, SPECIAL_DBUS_SERVICE); + if (!u) + return false; + if (!IN_SET((deserialized ? SERVICE(u)->deserialized_state : SERVICE(u)->state), + SERVICE_RUNNING, + SERVICE_RELOAD, + SERVICE_RELOAD_NOTIFY, + SERVICE_RELOAD_SIGNAL)) + return false; + + return true; +} + +static void manager_setup_bus(Manager *m) { + assert(m); + + /* Let's set up our private bus connection now, unconditionally */ + (void) bus_init_private(m); + + /* If we are in --user mode also connect to the system bus now */ + if (MANAGER_IS_USER(m)) + (void) bus_init_system(m); + + /* Let's connect to the bus now, but only if the unit is supposed to be up */ + if (manager_dbus_is_running(m, MANAGER_IS_RELOADING(m))) { + (void) bus_init_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_init_system(m); + } +} + +static void manager_preset_all(Manager *m) { + int r; + + assert(m); + + if (m->first_boot <= 0) + return; + + if (!MANAGER_IS_SYSTEM(m)) + return; + + if (MANAGER_IS_TEST_RUN(m)) + return; + + /* If this is the first boot, and we are in the host system, then preset everything */ + UnitFilePresetMode mode = + ENABLE_FIRST_BOOT_FULL_PRESET ? UNIT_FILE_PRESET_FULL : UNIT_FILE_PRESET_ENABLE_ONLY; + + r = unit_file_preset_all(RUNTIME_SCOPE_SYSTEM, 0, NULL, mode, NULL, 0); + if (r < 0) + log_full_errno(r == -EEXIST ? LOG_NOTICE : LOG_WARNING, r, + "Failed to populate /etc with preset unit settings, ignoring: %m"); + else + log_info("Populated /etc with preset unit settings."); +} + +static void manager_ready(Manager *m) { + assert(m); + + /* After having loaded everything, do the final round of catching up with what might have changed */ + + m->objective = MANAGER_OK; /* Tell everyone we are up now */ + + /* It might be safe to log to the journal now and connect to dbus */ + manager_recheck_journal(m); + manager_recheck_dbus(m); + + /* Let's finally catch up with any changes that took place while we were reloading/reexecing */ + manager_catchup(m); + + /* Create a file which will indicate when the manager started loading units the last time. */ + if (MANAGER_IS_SYSTEM(m)) + (void) touch_file("/run/systemd/systemd-units-load", false, + m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].realtime ?: now(CLOCK_REALTIME), + UID_INVALID, GID_INVALID, 0444); +} + +Manager* manager_reloading_start(Manager *m) { + m->n_reloading++; + dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_UNITS_LOAD); + return m; +} + +void manager_reloading_stopp(Manager **m) { + if (*m) { + assert((*m)->n_reloading > 0); + (*m)->n_reloading--; + } +} + +int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root) { + int r; + + assert(m); + + /* If we are running in test mode, we still want to run the generators, + * but we should not touch the real generator directories. */ + r = lookup_paths_init_or_warn(&m->lookup_paths, m->runtime_scope, + MANAGER_IS_TEST_RUN(m) ? LOOKUP_PATHS_TEMPORARY_GENERATED : 0, + root); + if (r < 0) + return r; + + dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_START)); + r = manager_run_environment_generators(m); + if (r >= 0) + r = manager_run_generators(m); + dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_FINISH)); + if (r < 0) + return r; + + manager_preset_all(m); + + lookup_paths_log(&m->lookup_paths); + + { + /* This block is (optionally) done with the reloading counter bumped */ + _unused_ _cleanup_(manager_reloading_stopp) Manager *reloading = NULL; + + /* Make sure we don't have a left-over from a previous run */ + if (!serialization) + (void) rm_rf(m->lookup_paths.transient, 0); + + /* If we will deserialize make sure that during enumeration this is already known, so we increase the + * counter here already */ + if (serialization) + reloading = manager_reloading_start(m); + + /* First, enumerate what we can from all config files */ + dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_START)); + manager_enumerate_perpetual(m); + manager_enumerate(m); + dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_FINISH)); + + /* Second, deserialize if there is something to deserialize */ + if (serialization) { + r = manager_deserialize(m, serialization, fds); + if (r < 0) + return log_error_errno(r, "Deserialization failed: %m"); + } + + /* Any fds left? Find some unit which wants them. This is useful to allow container managers to pass + * some file descriptors to us pre-initialized. This enables socket-based activation of entire + * containers. */ + manager_distribute_fds(m, fds); + + /* We might have deserialized the notify fd, but if we didn't then let's create the bus now */ + r = manager_setup_notify(m); + if (r < 0) + /* No sense to continue without notifications, our children would fail anyway. */ + return r; + + r = manager_setup_cgroups_agent(m); + if (r < 0) + /* Likewise, no sense to continue without empty cgroup notifications. */ + return r; + + r = manager_setup_user_lookup_fd(m); + if (r < 0) + /* This shouldn't fail, except if things are really broken. */ + return r; + + /* Connect to the bus if we are good for it */ + manager_setup_bus(m); + + /* Now that we are connected to all possible buses, let's deserialize who is tracking us. */ + r = bus_track_coldplug(m, &m->subscribed, false, m->deserialized_subscribed); + if (r < 0) + log_warning_errno(r, "Failed to deserialized tracked clients, ignoring: %m"); + m->deserialized_subscribed = strv_free(m->deserialized_subscribed); + + r = manager_varlink_init(m); + if (r < 0) + log_warning_errno(r, "Failed to set up Varlink, ignoring: %m"); + + /* Third, fire things up! */ + manager_coldplug(m); + + /* Clean up runtime objects */ + manager_vacuum(m); + + if (serialization) + /* Let's wait for the UnitNew/JobNew messages being sent, before we notify that the + * reload is finished */ + m->send_reloading_done = true; + } + + manager_ready(m); + + manager_set_switching_root(m, false); + + return 0; +} + +int manager_add_job( + Manager *m, + JobType type, + Unit *unit, + JobMode mode, + Set *affected_jobs, + sd_bus_error *error, + Job **ret) { + + _cleanup_(transaction_abort_and_freep) Transaction *tr = NULL; + int r; + + assert(m); + assert(type < _JOB_TYPE_MAX); + assert(unit); + assert(mode < _JOB_MODE_MAX); + + if (mode == JOB_ISOLATE && type != JOB_START) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Isolate is only valid for start."); + + if (mode == JOB_ISOLATE && !unit->allow_isolate) + return sd_bus_error_set(error, BUS_ERROR_NO_ISOLATION, "Operation refused, unit may not be isolated."); + + if (mode == JOB_TRIGGERING && type != JOB_STOP) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=triggering is only valid for stop."); + + if (mode == JOB_RESTART_DEPENDENCIES && type != JOB_START) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=restart-dependencies is only valid for start."); + + log_unit_debug(unit, "Trying to enqueue job %s/%s/%s", unit->id, job_type_to_string(type), job_mode_to_string(mode)); + + type = job_type_collapse(type, unit); + + tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY); + if (!tr) + return -ENOMEM; + + r = transaction_add_job_and_dependencies( + tr, + type, + unit, + /* by= */ NULL, + TRANSACTION_MATTERS | + (IN_SET(mode, JOB_IGNORE_DEPENDENCIES, JOB_IGNORE_REQUIREMENTS) ? TRANSACTION_IGNORE_REQUIREMENTS : 0) | + (mode == JOB_IGNORE_DEPENDENCIES ? TRANSACTION_IGNORE_ORDER : 0) | + (mode == JOB_RESTART_DEPENDENCIES ? TRANSACTION_PROPAGATE_START_AS_RESTART : 0), + error); + if (r < 0) + return r; + + if (mode == JOB_ISOLATE) { + r = transaction_add_isolate_jobs(tr, m); + if (r < 0) + return r; + } + + if (mode == JOB_TRIGGERING) { + r = transaction_add_triggering_jobs(tr, unit); + if (r < 0) + return r; + } + + r = transaction_activate(tr, m, mode, affected_jobs, error); + if (r < 0) + return r; + + log_unit_debug(unit, + "Enqueued job %s/%s as %u", unit->id, + job_type_to_string(type), (unsigned) tr->anchor_job->id); + + if (ret) + *ret = tr->anchor_job; + + tr = transaction_free(tr); + return 0; +} + +int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **ret) { + Unit *unit = NULL; /* just to appease gcc, initialization is not really necessary */ + int r; + + assert(m); + assert(type < _JOB_TYPE_MAX); + assert(name); + assert(mode < _JOB_MODE_MAX); + + r = manager_load_unit(m, name, NULL, NULL, &unit); + if (r < 0) + return r; + assert(unit); + + return manager_add_job(m, type, unit, mode, affected_jobs, e, ret); +} + +int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(m); + assert(type < _JOB_TYPE_MAX); + assert(name); + assert(mode < _JOB_MODE_MAX); + + r = manager_add_job_by_name(m, type, name, mode, affected_jobs, &error, ret); + if (r < 0) + return log_warning_errno(r, "Failed to enqueue %s job for %s: %s", job_mode_to_string(mode), name, bus_error_message(&error, r)); + + return r; +} + +int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e) { + int r; + _cleanup_(transaction_abort_and_freep) Transaction *tr = NULL; + + assert(m); + assert(unit); + assert(mode < _JOB_MODE_MAX); + assert(mode != JOB_ISOLATE); /* Isolate is only valid for start */ + + tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY); + if (!tr) + return -ENOMEM; + + /* We need an anchor job */ + r = transaction_add_job_and_dependencies(tr, JOB_NOP, unit, NULL, TRANSACTION_IGNORE_REQUIREMENTS|TRANSACTION_IGNORE_ORDER, e); + if (r < 0) + return r; + + /* Failure in adding individual dependencies is ignored, so this always succeeds. */ + transaction_add_propagate_reload_jobs( + tr, + unit, + tr->anchor_job, + mode == JOB_IGNORE_DEPENDENCIES ? TRANSACTION_IGNORE_ORDER : 0); + + r = transaction_activate(tr, m, mode, NULL, e); + if (r < 0) + return r; + + tr = transaction_free(tr); + return 0; +} + +Job *manager_get_job(Manager *m, uint32_t id) { + assert(m); + + return hashmap_get(m->jobs, UINT32_TO_PTR(id)); +} + +Unit *manager_get_unit(Manager *m, const char *name) { + assert(m); + assert(name); + + return hashmap_get(m->units, name); +} + +static int manager_dispatch_target_deps_queue(Manager *m) { + Unit *u; + int r = 0; + + assert(m); + + while ((u = LIST_POP(target_deps_queue, m->target_deps_queue))) { + _cleanup_free_ Unit **targets = NULL; + int n_targets; + + assert(u->in_target_deps_queue); + + u->in_target_deps_queue = false; + + /* Take an "atomic" snapshot of dependencies here, as the call below will likely modify the + * dependencies, and we can't have it that hash tables we iterate through are modified while + * we are iterating through them. */ + n_targets = unit_get_dependency_array(u, UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES, &targets); + if (n_targets < 0) + return n_targets; + + for (int i = 0; i < n_targets; i++) { + r = unit_add_default_target_dependency(u, targets[i]); + if (r < 0) + return r; + } + } + + return r; +} + +unsigned manager_dispatch_load_queue(Manager *m) { + Unit *u; + unsigned n = 0; + + assert(m); + + /* Make sure we are not run recursively */ + if (m->dispatching_load_queue) + return 0; + + m->dispatching_load_queue = true; + + /* Dispatches the load queue. Takes a unit from the queue and + * tries to load its data until the queue is empty */ + + while ((u = m->load_queue)) { + assert(u->in_load_queue); + + unit_load(u); + n++; + } + + m->dispatching_load_queue = false; + + /* Dispatch the units waiting for their target dependencies to be added now, as all targets that we know about + * should be loaded and have aliases resolved */ + (void) manager_dispatch_target_deps_queue(m); + + return n; +} + +bool manager_unit_cache_should_retry_load(Unit *u) { + assert(u); + + /* Automatic reloading from disk only applies to units which were not found sometime in the past, and + * the not-found stub is kept pinned in the unit graph by dependencies. For units that were + * previously loaded, we don't do automatic reloading, and daemon-reload is necessary to update. */ + if (u->load_state != UNIT_NOT_FOUND) + return false; + + /* The cache has been updated since the last time we tried to load the unit. There might be new + * fragment paths to read. */ + if (u->manager->unit_cache_timestamp_hash != u->fragment_not_found_timestamp_hash) + return true; + + /* The cache needs to be updated because there are modifications on disk. */ + return !lookup_paths_timestamp_hash_same(&u->manager->lookup_paths, u->manager->unit_cache_timestamp_hash, NULL); +} + +int manager_load_unit_prepare( + Manager *m, + const char *name, + const char *path, + sd_bus_error *e, + Unit **ret) { + + _cleanup_(unit_freep) Unit *cleanup_unit = NULL; + _cleanup_free_ char *nbuf = NULL; + int r; + + assert(m); + assert(ret); + assert(name || path); + + /* This will prepare the unit for loading, but not actually load anything from disk. */ + + if (path && !path_is_absolute(path)) + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path); + + if (!name) { + r = path_extract_filename(path, &nbuf); + if (r < 0) + return r; + if (r == O_DIRECTORY) + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' refers to directory, refusing.", path); + + name = nbuf; + } + + UnitType t = unit_name_to_type(name); + + if (t == _UNIT_TYPE_INVALID || !unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) { + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is missing the instance name.", name); + + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is not valid.", name); + } + + Unit *unit = manager_get_unit(m, name); + if (unit) { + /* The time-based cache allows to start new units without daemon-reload, + * but if they are already referenced (because of dependencies or ordering) + * then we have to force a load of the fragment. As an optimization, check + * first if anything in the usual paths was modified since the last time + * the cache was loaded. Also check if the last time an attempt to load the + * unit was made was before the most recent cache refresh, so that we know + * we need to try again — even if the cache is current, it might have been + * updated in a different context before we had a chance to retry loading + * this particular unit. */ + if (manager_unit_cache_should_retry_load(unit)) + unit->load_state = UNIT_STUB; + else { + *ret = unit; + return 0; /* The unit was already loaded */ + } + } else { + unit = cleanup_unit = unit_new(m, unit_vtable[t]->object_size); + if (!unit) + return -ENOMEM; + } + + if (path) { + r = free_and_strdup(&unit->fragment_path, path); + if (r < 0) + return r; + } + + r = unit_add_name(unit, name); + if (r < 0) + return r; + + unit_add_to_load_queue(unit); + unit_add_to_dbus_queue(unit); + unit_add_to_gc_queue(unit); + + *ret = unit; + TAKE_PTR(cleanup_unit); + + return 1; /* The unit was added the load queue */ +} + +int manager_load_unit( + Manager *m, + const char *name, + const char *path, + sd_bus_error *e, + Unit **ret) { + int r; + + assert(m); + assert(ret); + + /* This will load the unit config, but not actually start any services or anything. */ + + r = manager_load_unit_prepare(m, name, path, e, ret); + if (r <= 0) + return r; + + /* Unit was newly loaded */ + manager_dispatch_load_queue(m); + *ret = unit_follow_merge(*ret); + return 0; +} + +int manager_load_startable_unit_or_warn( + Manager *m, + const char *name, + const char *path, + Unit **ret) { + + /* Load a unit, make sure it loaded fully and is not masked. */ + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *unit; + int r; + + r = manager_load_unit(m, name, path, &error, &unit); + if (r < 0) + return log_error_errno(r, "Failed to load %s %s: %s", + name ? "unit" : "unit file", name ?: path, + bus_error_message(&error, r)); + + r = bus_unit_validate_load_state(unit, &error); + if (r < 0) + return log_error_errno(r, "%s", bus_error_message(&error, r)); + + *ret = unit; + return 0; +} + +void manager_clear_jobs(Manager *m) { + Job *j; + + assert(m); + + while ((j = hashmap_first(m->jobs))) + /* No need to recurse. We're cancelling all jobs. */ + job_finish_and_invalidate(j, JOB_CANCELED, false, false); +} + +void manager_unwatch_pidref(Manager *m, PidRef *pid) { + assert(m); + + for (;;) { + Unit *u; + + u = manager_get_unit_by_pidref_watching(m, pid); + if (!u) + break; + + unit_unwatch_pidref(u, pid); + } +} + +static int manager_dispatch_run_queue(sd_event_source *source, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + Job *j; + + assert(source); + + while ((j = prioq_peek(m->run_queue))) { + assert(j->installed); + assert(j->in_run_queue); + + (void) job_run_and_invalidate(j); + } + + if (m->n_running_jobs > 0) + manager_watch_jobs_in_progress(m); + + if (m->n_on_console > 0) + manager_watch_idle_pipe(m); + + return 1; +} + +void manager_trigger_run_queue(Manager *m) { + int r; + + assert(m); + + r = sd_event_source_set_enabled( + m->run_queue_event_source, + prioq_isempty(m->run_queue) ? SD_EVENT_OFF : SD_EVENT_ONESHOT); + if (r < 0) + log_warning_errno(r, "Failed to enable job run queue event source, ignoring: %m"); +} + +static unsigned manager_dispatch_dbus_queue(Manager *m) { + unsigned n = 0, budget; + Unit *u; + Job *j; + + assert(m); + + /* When we are reloading, let's not wait with generating signals, since we need to exit the manager as quickly + * as we can. There's no point in throttling generation of signals in that case. */ + if (MANAGER_IS_RELOADING(m) || m->send_reloading_done || m->pending_reload_message) + budget = UINT_MAX; /* infinite budget in this case */ + else { + /* Anything to do at all? */ + if (!m->dbus_unit_queue && !m->dbus_job_queue) + return 0; + + /* Do we have overly many messages queued at the moment? If so, let's not enqueue more on top, let's + * sit this cycle out, and process things in a later cycle when the queues got a bit emptier. */ + if (manager_bus_n_queued_write(m) > MANAGER_BUS_BUSY_THRESHOLD) + return 0; + + /* Only process a certain number of units/jobs per event loop iteration. Even if the bus queue wasn't + * overly full before this call we shouldn't increase it in size too wildly in one step, and we + * shouldn't monopolize CPU time with generating these messages. Note the difference in counting of + * this "budget" and the "threshold" above: the "budget" is decreased only once per generated message, + * regardless how many buses/direct connections it is enqueued on, while the "threshold" is applied to + * each queued instance of bus message, i.e. if the same message is enqueued to five buses/direct + * connections it will be counted five times. This difference in counting ("references" + * vs. "instances") is primarily a result of the fact that it's easier to implement it this way, + * however it also reflects the thinking that the "threshold" should put a limit on used queue memory, + * i.e. space, while the "budget" should put a limit on time. Also note that the "threshold" is + * currently chosen much higher than the "budget". */ + budget = MANAGER_BUS_MESSAGE_BUDGET; + } + + while (budget != 0 && (u = m->dbus_unit_queue)) { + + assert(u->in_dbus_queue); + + bus_unit_send_change_signal(u); + n++; + + if (budget != UINT_MAX) + budget--; + } + + while (budget != 0 && (j = m->dbus_job_queue)) { + assert(j->in_dbus_queue); + + bus_job_send_change_signal(j); + n++; + + if (budget != UINT_MAX) + budget--; + } + + if (m->send_reloading_done) { + m->send_reloading_done = false; + bus_manager_send_reloading(m, false); + n++; + } + + if (m->pending_reload_message) { + bus_send_pending_reload_message(m); + n++; + } + + return n; +} + +static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + char buf[PATH_MAX]; + ssize_t n; + + n = recv(fd, buf, sizeof(buf), 0); + if (n < 0) + return log_error_errno(errno, "Failed to read cgroups agent message: %m"); + if (n == 0) { + log_error("Got zero-length cgroups agent message, ignoring."); + return 0; + } + if ((size_t) n >= sizeof(buf)) { + log_error("Got overly long cgroups agent message, ignoring."); + return 0; + } + + if (memchr(buf, 0, n)) { + log_error("Got cgroups agent message with embedded NUL byte, ignoring."); + return 0; + } + buf[n] = 0; + + manager_notify_cgroup_empty(m, buf); + (void) bus_forward_agent_released(m, buf); + + return 0; +} + +static bool manager_process_barrier_fd(char * const *tags, FDSet *fds) { + + /* nothing else must be sent when using BARRIER=1 */ + if (strv_contains(tags, "BARRIER=1")) { + if (strv_length(tags) != 1) + log_warning("Extra notification messages sent with BARRIER=1, ignoring everything."); + else if (fdset_size(fds) != 1) + log_warning("Got incorrect number of fds with BARRIER=1, closing them."); + + /* Drop the message if BARRIER=1 was found */ + return true; + } + + return false; +} + +static void manager_invoke_notify_message( + Manager *m, + Unit *u, + const struct ucred *ucred, + char * const *tags, + FDSet *fds) { + + assert(m); + assert(u); + assert(ucred); + assert(tags); + + if (u->notifygen == m->notifygen) /* Already invoked on this same unit in this same iteration? */ + return; + u->notifygen = m->notifygen; + + if (UNIT_VTABLE(u)->notify_message) + UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds); + + else if (DEBUG_LOGGING) { + _cleanup_free_ char *buf = NULL, *x = NULL, *y = NULL; + + buf = strv_join(tags, ", "); + if (buf) + x = ellipsize(buf, 20, 90); + if (x) + y = cescape(x); + + log_unit_debug(u, "Got notification message \"%s\", ignoring.", strnull(y)); + } +} + +static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + + _cleanup_fdset_free_ FDSet *fds = NULL; + Manager *m = ASSERT_PTR(userdata); + char buf[NOTIFY_BUFFER_MAX+1]; + struct iovec iovec = { + .iov_base = buf, + .iov_len = sizeof(buf)-1, + }; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + + struct cmsghdr *cmsg; + struct ucred *ucred = NULL; + _cleanup_free_ Unit **array_copy = NULL; + _cleanup_strv_free_ char **tags = NULL; + Unit *u1, *u2, **array; + int r, *fd_array = NULL; + size_t n_fds = 0; + bool found = false; + ssize_t n; + + assert(m->notify_fd == fd); + + if (revents != EPOLLIN) { + log_warning("Got unexpected poll event for notify fd."); + return 0; + } + + n = recvmsg_safe(m->notify_fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC); + if (ERRNO_IS_NEG_TRANSIENT(n)) + return 0; /* Spurious wakeup, try again */ + if (n == -EXFULL) { + log_warning("Got message with truncated control data (too many fds sent?), ignoring."); + return 0; + } + if (n < 0) + /* If this is any other, real error, then stop processing this socket. This of course means + * we won't take notification messages anymore, but that's still better than busy looping: + * being woken up over and over again, but being unable to actually read the message from the + * socket. */ + return log_error_errno(n, "Failed to receive notification message: %m"); + + CMSG_FOREACH(cmsg, &msghdr) + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { + + assert(!fd_array); + fd_array = CMSG_TYPED_DATA(cmsg, int); + n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + + } else if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS && + cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { + + assert(!ucred); + ucred = CMSG_TYPED_DATA(cmsg, struct ucred); + } + + if (n_fds > 0) { + assert(fd_array); + + r = fdset_new_array(&fds, fd_array, n_fds); + if (r < 0) { + close_many(fd_array, n_fds); + log_oom(); + return 0; + } + } + + if (!ucred || !pid_is_valid(ucred->pid)) { + log_warning("Received notify message without valid credentials. Ignoring."); + return 0; + } + + if ((size_t) n >= sizeof(buf) || (msghdr.msg_flags & MSG_TRUNC)) { + log_warning("Received notify message exceeded maximum size. Ignoring."); + return 0; + } + + /* As extra safety check, let's make sure the string we get doesn't contain embedded NUL bytes. + * We permit one trailing NUL byte in the message, but don't expect it. */ + if (n > 1 && memchr(buf, 0, n-1)) { + log_warning("Received notify message with embedded NUL bytes. Ignoring."); + return 0; + } + + /* Make sure it's NUL-terminated, then parse it to obtain the tags list. */ + buf[n] = 0; + tags = strv_split_newlines(buf); + if (!tags) { + log_oom(); + return 0; + } + + /* Possibly a barrier fd, let's see. */ + if (manager_process_barrier_fd(tags, fds)) { + log_debug("Received barrier notification message from PID " PID_FMT ".", ucred->pid); + return 0; + } + + /* Increase the generation counter used for filtering out duplicate unit invocations. */ + m->notifygen++; + + /* Generate lookup key from the PID (we have no pidfd here, after all) */ + PidRef pidref = PIDREF_MAKE_FROM_PID(ucred->pid); + + /* Notify every unit that might be interested, which might be multiple. */ + u1 = manager_get_unit_by_pidref_cgroup(m, &pidref); + u2 = hashmap_get(m->watch_pids, &pidref); + array = hashmap_get(m->watch_pids_more, &pidref); + if (array) { + size_t k = 0; + + while (array[k]) + k++; + + array_copy = newdup(Unit*, array, k+1); + if (!array_copy) + log_oom(); + } + /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle + * duplicate units make sure we only invoke each unit's handler once. */ + if (u1) { + manager_invoke_notify_message(m, u1, ucred, tags, fds); + found = true; + } + if (u2) { + manager_invoke_notify_message(m, u2, ucred, tags, fds); + found = true; + } + if (array_copy) + for (size_t i = 0; array_copy[i]; i++) { + manager_invoke_notify_message(m, array_copy[i], ucred, tags, fds); + found = true; + } + + if (!found) + log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid); + + if (fdset_size(fds) > 0) + log_warning("Got extra auxiliary fds with notification message, closing them."); + + return 0; +} + +static void manager_invoke_sigchld_event( + Manager *m, + Unit *u, + const siginfo_t *si) { + + assert(m); + assert(u); + assert(si); + + /* Already invoked the handler of this unit in this iteration? Then don't process this again */ + if (u->sigchldgen == m->sigchldgen) + return; + u->sigchldgen = m->sigchldgen; + + log_unit_debug(u, "Child "PID_FMT" belongs to %s.", si->si_pid, u->id); + unit_unwatch_pid(u, si->si_pid); + + if (UNIT_VTABLE(u)->sigchld_event) + UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status); +} + +static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + siginfo_t si = {}; + int r; + + assert(source); + + /* First we call waitid() for a PID and do not reap the zombie. That way we can still access + * /proc/$PID for it while it is a zombie. */ + + if (waitid(P_ALL, 0, &si, WEXITED|WNOHANG|WNOWAIT) < 0) { + + if (errno != ECHILD) + log_error_errno(errno, "Failed to peek for child with waitid(), ignoring: %m"); + + goto turn_off; + } + + if (si.si_pid <= 0) + goto turn_off; + + if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) { + _cleanup_free_ Unit **array_copy = NULL; + _cleanup_free_ char *name = NULL; + Unit *u1, *u2, **array; + + (void) pid_get_comm(si.si_pid, &name); + + log_debug("Child "PID_FMT" (%s) died (code=%s, status=%i/%s)", + si.si_pid, strna(name), + sigchld_code_to_string(si.si_code), + si.si_status, + strna(si.si_code == CLD_EXITED + ? exit_status_to_string(si.si_status, EXIT_STATUS_FULL) + : signal_to_string(si.si_status))); + + /* Increase the generation counter used for filtering out duplicate unit invocations */ + m->sigchldgen++; + + /* We look this up by a PidRef that only consists of the PID. After all we couldn't create a + * pidfd here any more even if we wanted (since the process just exited). */ + PidRef pidref = PIDREF_MAKE_FROM_PID(si.si_pid); + + /* And now figure out the unit this belongs to, it might be multiple... */ + u1 = manager_get_unit_by_pidref_cgroup(m, &pidref); + u2 = hashmap_get(m->watch_pids, &pidref); + array = hashmap_get(m->watch_pids_more, &pidref); + if (array) { + size_t n = 0; + + /* Count how many entries the array has */ + while (array[n]) + n++; + + /* Make a copy of the array so that we don't trip up on the array changing beneath us */ + array_copy = newdup(Unit*, array, n+1); + if (!array_copy) + log_oom(); + } + + /* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but + * that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for + * each iteration. */ + if (u1) { + /* We check for oom condition, in case we got SIGCHLD before the oom notification. + * We only do this for the cgroup the PID belonged to. */ + (void) unit_check_oom(u1); + + /* We check if systemd-oomd performed a kill so that we log and notify appropriately */ + (void) unit_check_oomd_kill(u1); + + manager_invoke_sigchld_event(m, u1, &si); + } + if (u2) + manager_invoke_sigchld_event(m, u2, &si); + if (array_copy) + for (size_t i = 0; array_copy[i]; i++) + manager_invoke_sigchld_event(m, array_copy[i], &si); + } + + /* And now, we actually reap the zombie. */ + if (waitid(P_PID, si.si_pid, &si, WEXITED) < 0) { + log_error_errno(errno, "Failed to dequeue child, ignoring: %m"); + return 0; + } + + return 0; + +turn_off: + /* All children processed for now, turn off event source */ + + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF); + if (r < 0) + return log_error_errno(r, "Failed to disable SIGCHLD event source: %m"); + + return 0; +} + +static void manager_start_special(Manager *m, const char *name, JobMode mode) { + Job *job; + + if (manager_add_job_by_name_and_warn(m, JOB_START, name, mode, NULL, &job) < 0) + return; + + const char *s = unit_status_string(job->unit, NULL); + + log_info("Activating special unit %s...", s); + + sd_notifyf(false, + "STATUS=Activating special unit %s...", s); + m->status_ready = false; +} + +static void manager_handle_ctrl_alt_del(Manager *m) { + /* If the user presses C-A-D more than + * 7 times within 2s, we reboot/shutdown immediately, + * unless it was disabled in system.conf */ + + if (ratelimit_below(&m->ctrl_alt_del_ratelimit) || m->cad_burst_action == EMERGENCY_ACTION_NONE) + manager_start_special(m, SPECIAL_CTRL_ALT_DEL_TARGET, JOB_REPLACE_IRREVERSIBLY); + else + emergency_action(m, m->cad_burst_action, EMERGENCY_ACTION_WARN, NULL, -1, + "Ctrl-Alt-Del was pressed more than 7 times within 2s"); +} + +static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + ssize_t n; + struct signalfd_siginfo sfsi; + int r; + + assert(m->signal_fd == fd); + + if (revents != EPOLLIN) { + log_warning("Got unexpected events from signal file descriptor."); + return 0; + } + + n = read(m->signal_fd, &sfsi, sizeof(sfsi)); + if (n < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + /* We return an error here, which will kill this handler, + * to avoid a busy loop on read error. */ + return log_error_errno(errno, "Reading from signal fd failed: %m"); + } + if (n != sizeof(sfsi)) { + log_warning("Truncated read from signal fd (%zi bytes), ignoring!", n); + return 0; + } + + log_received_signal(sfsi.ssi_signo == SIGCHLD || + (sfsi.ssi_signo == SIGTERM && MANAGER_IS_USER(m)) + ? LOG_DEBUG : LOG_INFO, + &sfsi); + + switch (sfsi.ssi_signo) { + + case SIGCHLD: + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON); + if (r < 0) + log_warning_errno(r, "Failed to enable SIGCHLD event source, ignoring: %m"); + + break; + + case SIGTERM: + if (MANAGER_IS_SYSTEM(m)) { + /* This is for compatibility with the original sysvinit */ + if (verify_run_space_and_log("Refusing to reexecute") < 0) + break; + + m->objective = MANAGER_REEXECUTE; + break; + } + + _fallthrough_; + case SIGINT: + if (MANAGER_IS_SYSTEM(m)) + manager_handle_ctrl_alt_del(m); + else + manager_start_special(m, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY); + break; + + case SIGWINCH: + /* This is a nop on non-init */ + if (MANAGER_IS_SYSTEM(m)) + manager_start_special(m, SPECIAL_KBREQUEST_TARGET, JOB_REPLACE); + + break; + + case SIGPWR: + /* This is a nop on non-init */ + if (MANAGER_IS_SYSTEM(m)) + manager_start_special(m, SPECIAL_SIGPWR_TARGET, JOB_REPLACE); + + break; + + case SIGUSR1: + if (manager_dbus_is_running(m, false)) { + log_info("Trying to reconnect to bus..."); + + (void) bus_init_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_init_system(m); + } else + manager_start_special(m, SPECIAL_DBUS_SERVICE, JOB_REPLACE); + + break; + + case SIGUSR2: { + _cleanup_free_ char *dump = NULL; + + r = manager_get_dump_string(m, /* patterns= */ NULL, &dump); + if (r < 0) { + log_warning_errno(errno, "Failed to acquire manager dump: %m"); + break; + } + + log_dump(LOG_INFO, dump); + break; + } + + case SIGHUP: + if (verify_run_space_and_log("Refusing to reload") < 0) + break; + + m->objective = MANAGER_RELOAD; + break; + + default: { + + /* Starting SIGRTMIN+0 */ + static const struct { + const char *target; + JobMode mode; + } target_table[] = { + [0] = { SPECIAL_DEFAULT_TARGET, JOB_ISOLATE }, + [1] = { SPECIAL_RESCUE_TARGET, JOB_ISOLATE }, + [2] = { SPECIAL_EMERGENCY_TARGET, JOB_ISOLATE }, + [3] = { SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [4] = { SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [5] = { SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [6] = { SPECIAL_KEXEC_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [7] = { SPECIAL_SOFT_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY }, + }; + + /* Starting SIGRTMIN+13, so that target halt and system halt are 10 apart */ + static const ManagerObjective objective_table[] = { + [0] = MANAGER_HALT, + [1] = MANAGER_POWEROFF, + [2] = MANAGER_REBOOT, + [3] = MANAGER_KEXEC, + [4] = MANAGER_SOFT_REBOOT, + }; + + if ((int) sfsi.ssi_signo >= SIGRTMIN+0 && + (int) sfsi.ssi_signo < SIGRTMIN+(int) ELEMENTSOF(target_table)) { + int idx = (int) sfsi.ssi_signo - SIGRTMIN; + manager_start_special(m, target_table[idx].target, target_table[idx].mode); + break; + } + + if ((int) sfsi.ssi_signo >= SIGRTMIN+13 && + (int) sfsi.ssi_signo < SIGRTMIN+13+(int) ELEMENTSOF(objective_table)) { + m->objective = objective_table[sfsi.ssi_signo - SIGRTMIN - 13]; + break; + } + + switch (sfsi.ssi_signo - SIGRTMIN) { + + case 18: { + bool generic = false; + + if (sfsi.ssi_code != SI_QUEUE) + generic = true; + else { + /* Override a few select commands by our own PID1-specific logic */ + + switch (sfsi.ssi_int) { + + case _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE..._COMMON_SIGNAL_COMMAND_LOG_LEVEL_END: + manager_override_log_level(m, sfsi.ssi_int - _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE); + break; + + case COMMON_SIGNAL_COMMAND_CONSOLE: + manager_override_log_target(m, LOG_TARGET_CONSOLE); + break; + + case COMMON_SIGNAL_COMMAND_JOURNAL: + manager_override_log_target(m, LOG_TARGET_JOURNAL); + break; + + case COMMON_SIGNAL_COMMAND_KMSG: + manager_override_log_target(m, LOG_TARGET_KMSG); + break; + + case COMMON_SIGNAL_COMMAND_NULL: + manager_override_log_target(m, LOG_TARGET_NULL); + break; + + case MANAGER_SIGNAL_COMMAND_DUMP_JOBS: { + _cleanup_free_ char *dump_jobs = NULL; + + r = manager_get_dump_jobs_string(m, /* patterns= */ NULL, " ", &dump_jobs); + if (r < 0) { + log_warning_errno(errno, "Failed to acquire manager jobs dump: %m"); + break; + } + + log_dump(LOG_INFO, dump_jobs); + break; + } + + default: + generic = true; + } + } + + if (generic) + return sigrtmin18_handler(source, &sfsi, NULL); + + break; + } + + case 20: + manager_override_show_status(m, SHOW_STATUS_YES, "signal"); + break; + + case 21: + manager_override_show_status(m, SHOW_STATUS_NO, "signal"); + break; + + case 22: + manager_override_log_level(m, LOG_DEBUG); + break; + + case 23: + manager_restore_original_log_level(m); + break; + + case 24: + if (MANAGER_IS_USER(m)) { + m->objective = MANAGER_EXIT; + return 0; + } + + /* This is a nop on init */ + break; + + case 25: + m->objective = MANAGER_REEXECUTE; + break; + + case 26: + case 29: /* compatibility: used to be mapped to LOG_TARGET_SYSLOG_OR_KMSG */ + manager_restore_original_log_target(m); + break; + + case 27: + manager_override_log_target(m, LOG_TARGET_CONSOLE); + break; + + case 28: + manager_override_log_target(m, LOG_TARGET_KMSG); + break; + + default: + log_warning("Got unhandled signal <%s>.", signal_to_string(sfsi.ssi_signo)); + } + }} + + return 0; +} + +static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + Unit *u; + + log_struct(LOG_DEBUG, + "MESSAGE_ID=" SD_MESSAGE_TIME_CHANGE_STR, + LOG_MESSAGE("Time has been changed")); + + /* Restart the watch */ + (void) manager_setup_time_change(m); + + HASHMAP_FOREACH(u, m->units) + if (UNIT_VTABLE(u)->time_change) + UNIT_VTABLE(u)->time_change(u); + + return 0; +} + +static int manager_dispatch_timezone_change( + sd_event_source *source, + const struct inotify_event *e, + void *userdata) { + + Manager *m = ASSERT_PTR(userdata); + int changed; + Unit *u; + + log_debug("inotify event for /etc/localtime"); + + changed = manager_read_timezone_stat(m); + if (changed <= 0) + return changed; + + /* Something changed, restart the watch, to ensure we watch the new /etc/localtime if it changed */ + (void) manager_setup_timezone_change(m); + + /* Read the new timezone */ + tzset(); + + log_debug("Timezone has been changed (now: %s).", tzname[daylight]); + + HASHMAP_FOREACH(u, m->units) + if (UNIT_VTABLE(u)->timezone_change) + UNIT_VTABLE(u)->timezone_change(u); + + return 0; +} + +static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(m->idle_pipe[2] == fd); + + /* There's at least one Type=idle child that just gave up on us waiting for the boot process to + * complete. Let's now turn off any further console output if there's at least one service that needs + * console access, so that from now on our own output should not spill into that service's output + * anymore. After all, we support Type=idle only to beautify console output and it generally is set + * on services that want to own the console exclusively without our interference. */ + m->no_console_output = m->n_on_console > 0; + + /* Acknowledge the child's request, and let all other children know too that they shouldn't wait + * any longer by closing the pipes towards them, which is what they are waiting for. */ + manager_close_idle_pipe(m); + + return 0; +} + +static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(source); + + manager_print_jobs_in_progress(m); + + r = sd_event_source_set_time_relative(source, JOBS_IN_PROGRESS_PERIOD_USEC); + if (r < 0) + return r; + + return sd_event_source_set_enabled(source, SD_EVENT_ONESHOT); +} + +int manager_loop(Manager *m) { + RateLimit rl = { .interval = 1*USEC_PER_SEC, .burst = 50000 }; + int r; + + assert(m); + assert(m->objective == MANAGER_OK); /* Ensure manager_startup() has been called */ + + manager_check_finished(m); + + /* There might still be some zombies hanging around from before we were exec()'ed. Let's reap them. */ + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON); + if (r < 0) + return log_error_errno(r, "Failed to enable SIGCHLD event source: %m"); + + while (m->objective == MANAGER_OK) { + + (void) watchdog_ping(); + + if (!ratelimit_below(&rl)) { + /* Yay, something is going seriously wrong, pause a little */ + log_warning("Looping too fast. Throttling execution a little."); + sleep(1); + } + + if (manager_dispatch_load_queue(m) > 0) + continue; + + if (manager_dispatch_gc_job_queue(m) > 0) + continue; + + if (manager_dispatch_gc_unit_queue(m) > 0) + continue; + + if (manager_dispatch_cleanup_queue(m) > 0) + continue; + + if (manager_dispatch_cgroup_realize_queue(m) > 0) + continue; + + if (manager_dispatch_start_when_upheld_queue(m) > 0) + continue; + + if (manager_dispatch_stop_when_bound_queue(m) > 0) + continue; + + if (manager_dispatch_stop_when_unneeded_queue(m) > 0) + continue; + + if (manager_dispatch_release_resources_queue(m) > 0) + continue; + + if (manager_dispatch_dbus_queue(m) > 0) + continue; + + /* Sleep for watchdog runtime wait time */ + r = sd_event_run(m->event, watchdog_runtime_wait()); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + } + + return m->objective; +} + +int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u) { + _cleanup_free_ char *n = NULL; + sd_id128_t invocation_id; + Unit *u; + int r; + + assert(m); + assert(s); + assert(_u); + + r = unit_name_from_dbus_path(s, &n); + if (r < 0) + return r; + + /* Permit addressing units by invocation ID: if the passed bus path is suffixed by a 128-bit ID then + * we use it as invocation ID. */ + r = sd_id128_from_string(n, &invocation_id); + if (r >= 0) { + u = hashmap_get(m->units_by_invocation_id, &invocation_id); + if (u) { + *_u = u; + return 0; + } + + return sd_bus_error_setf(e, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, + "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", + SD_ID128_FORMAT_VAL(invocation_id)); + } + + /* If this didn't work, we check if this is a unit name */ + if (!unit_name_is_valid(n, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) { + _cleanup_free_ char *nn = NULL; + + nn = cescape(n); + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, + "Unit name %s is neither a valid invocation ID nor unit name.", strnull(nn)); + } + + r = manager_load_unit(m, n, NULL, e, &u); + if (r < 0) + return r; + + *_u = u; + return 0; +} + +int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j) { + const char *p; + unsigned id; + Job *j; + int r; + + assert(m); + assert(s); + assert(_j); + + p = startswith(s, "/org/freedesktop/systemd1/job/"); + if (!p) + return -EINVAL; + + r = safe_atou(p, &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return -ENOENT; + + *_j = j; + + return 0; +} + +void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success) { + +#if HAVE_AUDIT + _cleanup_free_ char *p = NULL; + const char *msg; + int audit_fd, r; + + if (!MANAGER_IS_SYSTEM(m)) + return; + + audit_fd = get_audit_fd(); + if (audit_fd < 0) + return; + + /* Don't generate audit events if the service was already + * started and we're just deserializing */ + if (MANAGER_IS_RELOADING(m)) + return; + + r = unit_name_to_prefix_and_instance(u->id, &p); + if (r < 0) { + log_warning_errno(r, "Failed to extract prefix and instance of unit name, ignoring: %m"); + return; + } + + msg = strjoina("unit=", p); + if (audit_log_user_comm_message(audit_fd, type, msg, "systemd", NULL, NULL, NULL, success) < 0) { + if (ERRNO_IS_PRIVILEGE(errno)) { + /* We aren't allowed to send audit messages? Then let's not retry again. */ + log_debug_errno(errno, "Failed to send audit message, closing audit socket: %m"); + close_audit_fd(); + } else + log_warning_errno(errno, "Failed to send audit message, ignoring: %m"); + } +#endif + +} + +void manager_send_unit_plymouth(Manager *m, Unit *u) { + _cleanup_free_ char *message = NULL; + int c, r; + + /* Don't generate plymouth events if the service was already + * started and we're just deserializing */ + if (MANAGER_IS_RELOADING(m)) + return; + + if (!MANAGER_IS_SYSTEM(m)) + return; + + if (detect_container() > 0) + return; + + if (!UNIT_VTABLE(u)->notify_plymouth) + return; + + c = asprintf(&message, "U\x02%c%s%c", (int) (strlen(u->id) + 1), u->id, '\x00'); + if (c < 0) + return (void) log_oom(); + + /* We set SOCK_NONBLOCK here so that we rather drop the message then wait for plymouth */ + r = plymouth_send_raw(message, c, SOCK_NONBLOCK); + if (r < 0) + log_full_errno(ERRNO_IS_NO_PLYMOUTH(r) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to communicate with plymouth: %m"); +} + +usec_t manager_get_watchdog(Manager *m, WatchdogType t) { + assert(m); + + if (MANAGER_IS_USER(m)) + return USEC_INFINITY; + + if (m->watchdog_overridden[t] != USEC_INFINITY) + return m->watchdog_overridden[t]; + + return m->watchdog[t]; +} + +void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout) { + + assert(m); + + if (MANAGER_IS_USER(m)) + return; + + if (m->watchdog[t] == timeout) + return; + + if (m->watchdog_overridden[t] == USEC_INFINITY) { + if (t == WATCHDOG_RUNTIME) + (void) watchdog_setup(timeout); + else if (t == WATCHDOG_PRETIMEOUT) + (void) watchdog_setup_pretimeout(timeout); + } + + m->watchdog[t] = timeout; +} + +void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout) { + usec_t usec; + + assert(m); + + if (MANAGER_IS_USER(m)) + return; + + if (m->watchdog_overridden[t] == timeout) + return; + + usec = timeout == USEC_INFINITY ? m->watchdog[t] : timeout; + if (t == WATCHDOG_RUNTIME) + (void) watchdog_setup(usec); + else if (t == WATCHDOG_PRETIMEOUT) + (void) watchdog_setup_pretimeout(usec); + + m->watchdog_overridden[t] = timeout; +} + +int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor) { + _cleanup_free_ char *p = NULL; + int r; + + assert(m); + + if (MANAGER_IS_USER(m)) + return 0; + + if (streq_ptr(m->watchdog_pretimeout_governor, governor)) + return 0; + + p = strdup(governor); + if (!p) + return -ENOMEM; + + r = watchdog_setup_pretimeout_governor(governor); + if (r < 0) + return r; + + return free_and_replace(m->watchdog_pretimeout_governor, p); +} + +int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor) { + _cleanup_free_ char *p = NULL; + int r; + + assert(m); + + if (MANAGER_IS_USER(m)) + return 0; + + if (streq_ptr(m->watchdog_pretimeout_governor_overridden, governor)) + return 0; + + p = strdup(governor); + if (!p) + return -ENOMEM; + + r = watchdog_setup_pretimeout_governor(governor); + if (r < 0) + return r; + + return free_and_replace(m->watchdog_pretimeout_governor_overridden, p); +} + +int manager_reload(Manager *m) { + _unused_ _cleanup_(manager_reloading_stopp) Manager *reloading = NULL; + _cleanup_fdset_free_ FDSet *fds = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(m); + + r = manager_open_serialization(m, &f); + if (r < 0) + return log_error_errno(r, "Failed to create serialization file: %m"); + + fds = fdset_new(); + if (!fds) + return log_oom(); + + /* We are officially in reload mode from here on. */ + reloading = manager_reloading_start(m); + + r = manager_serialize(m, f, fds, false); + if (r < 0) + return r; + + if (fseeko(f, 0, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek to beginning of serialization: %m"); + + /* 💀 This is the point of no return, from here on there is no way back. 💀 */ + reloading = NULL; + + bus_manager_send_reloading(m, true); + + /* Start by flushing out all jobs and units, all generated units, all runtime environments, all dynamic users + * and everything else that is worth flushing out. We'll get it all back from the serialization — if we need + * it. */ + + manager_clear_jobs_and_units(m); + lookup_paths_flush_generator(&m->lookup_paths); + lookup_paths_free(&m->lookup_paths); + exec_shared_runtime_vacuum(m); + dynamic_user_vacuum(m, false); + m->uid_refs = hashmap_free(m->uid_refs); + m->gid_refs = hashmap_free(m->gid_refs); + + r = lookup_paths_init_or_warn(&m->lookup_paths, m->runtime_scope, 0, NULL); + if (r < 0) + return r; + + (void) manager_run_environment_generators(m); + (void) manager_run_generators(m); + + lookup_paths_log(&m->lookup_paths); + + /* We flushed out generated files, for which we don't watch mtime, so we should flush the old map. */ + manager_free_unit_name_maps(m); + m->unit_file_state_outdated = false; + + /* First, enumerate what we can from kernel and suchlike */ + manager_enumerate_perpetual(m); + manager_enumerate(m); + + /* Second, deserialize our stored data */ + r = manager_deserialize(m, f, fds); + if (r < 0) + log_warning_errno(r, "Deserialization failed, proceeding anyway: %m"); + + /* We don't need the serialization anymore */ + f = safe_fclose(f); + + /* Re-register notify_fd as event source, and set up other sockets/communication channels we might need */ + (void) manager_setup_notify(m); + (void) manager_setup_cgroups_agent(m); + (void) manager_setup_user_lookup_fd(m); + + /* Third, fire things up! */ + manager_coldplug(m); + + /* Clean up runtime objects no longer referenced */ + manager_vacuum(m); + + /* Clean up deserialized tracked clients */ + m->deserialized_subscribed = strv_free(m->deserialized_subscribed); + + /* Consider the reload process complete now. */ + assert(m->n_reloading > 0); + m->n_reloading--; + + manager_ready(m); + + m->send_reloading_done = true; + return 0; +} + +void manager_reset_failed(Manager *m) { + Unit *u; + + assert(m); + + HASHMAP_FOREACH(u, m->units) + unit_reset_failed(u); +} + +bool manager_unit_inactive_or_pending(Manager *m, const char *name) { + Unit *u; + + assert(m); + assert(name); + + /* Returns true if the unit is inactive or going down */ + u = manager_get_unit(m, name); + if (!u) + return true; + + return unit_inactive_or_pending(u); +} + +static void log_taint_string(Manager *m) { + _cleanup_free_ char *taint = NULL; + + assert(m); + + if (MANAGER_IS_USER(m) || m->taint_logged) + return; + + m->taint_logged = true; /* only check for taint once */ + + taint = manager_taint_string(m); + if (isempty(taint)) + return; + + log_struct(LOG_NOTICE, + LOG_MESSAGE("System is tainted: %s", taint), + "TAINT=%s", taint, + "MESSAGE_ID=" SD_MESSAGE_TAINTED_STR); +} + +static void manager_notify_finished(Manager *m) { + usec_t firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec; + + if (MANAGER_IS_TEST_RUN(m)) + return; + + if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) { + char buf[FORMAT_TIMESPAN_MAX + STRLEN(" (firmware) + ") + FORMAT_TIMESPAN_MAX + STRLEN(" (loader) + ")] + = {}; + char *p = buf; + size_t size = sizeof buf; + + /* Note that MANAGER_TIMESTAMP_KERNEL's monotonic value is always at 0, and + * MANAGER_TIMESTAMP_FIRMWARE's and MANAGER_TIMESTAMP_LOADER's monotonic value should be considered + * negative values. */ + + firmware_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic - m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic; + loader_usec = m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic; + userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic; + total_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic + m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic; + + if (firmware_usec > 0) + size = strpcpyf(&p, size, "%s (firmware) + ", FORMAT_TIMESPAN(firmware_usec, USEC_PER_MSEC)); + if (loader_usec > 0) + size = strpcpyf(&p, size, "%s (loader) + ", FORMAT_TIMESPAN(loader_usec, USEC_PER_MSEC)); + + if (dual_timestamp_is_set(&m->timestamps[MANAGER_TIMESTAMP_INITRD])) { + + /* The initrd case on bare-metal */ + kernel_usec = m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic; + initrd_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR, + "KERNEL_USEC="USEC_FMT, kernel_usec, + "INITRD_USEC="USEC_FMT, initrd_usec, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (initrd) + %s (userspace) = %s.", + buf, + FORMAT_TIMESPAN(kernel_usec, USEC_PER_MSEC), + FORMAT_TIMESPAN(initrd_usec, USEC_PER_MSEC), + FORMAT_TIMESPAN(userspace_usec, USEC_PER_MSEC), + FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC))); + } else { + /* The initrd-less case on bare-metal */ + + kernel_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic; + initrd_usec = 0; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR, + "KERNEL_USEC="USEC_FMT, kernel_usec, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (userspace) = %s.", + buf, + FORMAT_TIMESPAN(kernel_usec, USEC_PER_MSEC), + FORMAT_TIMESPAN(userspace_usec, USEC_PER_MSEC), + FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC))); + } + } else { + /* The container and --user case */ + firmware_usec = loader_usec = initrd_usec = kernel_usec = 0; + total_usec = userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_USER_STARTUP_FINISHED_STR, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Startup finished in %s.", + FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC))); + } + + bus_manager_send_finished(m, firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec); + + log_taint_string(m); +} + +static void user_manager_send_ready(Manager *m) { + int r; + + assert(m); + + /* We send READY=1 on reaching basic.target only when running in --user mode. */ + if (!MANAGER_IS_USER(m) || m->ready_sent) + return; + + r = sd_notify(false, + "READY=1\n" + "STATUS=Reached " SPECIAL_BASIC_TARGET "."); + if (r < 0) + log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); + + m->ready_sent = true; + m->status_ready = false; +} + +static void manager_send_ready(Manager *m) { + int r; + + if (m->ready_sent && m->status_ready) + /* Skip the notification if nothing changed. */ + return; + + r = sd_notify(false, + "READY=1\n" + "STATUS=Ready."); + if (r < 0) + log_full_errno(m->ready_sent ? LOG_DEBUG : LOG_WARNING, r, + "Failed to send readiness notification, ignoring: %m"); + + m->ready_sent = m->status_ready = true; +} + +static void manager_check_basic_target(Manager *m) { + Unit *u; + + assert(m); + + /* Small shortcut */ + if (m->ready_sent && m->taint_logged) + return; + + u = manager_get_unit(m, SPECIAL_BASIC_TARGET); + if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return; + + /* For user managers, send out READY=1 as soon as we reach basic.target */ + user_manager_send_ready(m); + + /* Log the taint string as soon as we reach basic.target */ + log_taint_string(m); +} + +void manager_check_finished(Manager *m) { + assert(m); + + if (MANAGER_IS_RELOADING(m)) + return; + + /* Verify that we have entered the event loop already, and not left it again. */ + if (!MANAGER_IS_RUNNING(m)) + return; + + manager_check_basic_target(m); + + if (hashmap_size(m->jobs) > 0) { + if (m->jobs_in_progress_event_source) + /* Ignore any failure, this is only for feedback */ + (void) sd_event_source_set_time(m->jobs_in_progress_event_source, + manager_watch_jobs_next_time(m)); + return; + } + + /* The jobs hashmap tends to grow a lot during boot, and then it's not reused until shutdown. Let's + kill the hashmap if it is relatively large. */ + if (hashmap_buckets(m->jobs) > hashmap_size(m->units) / 10) + m->jobs = hashmap_free(m->jobs); + + manager_send_ready(m); + + /* Notify Type=idle units that we are done now */ + manager_close_idle_pipe(m); + + if (MANAGER_IS_FINISHED(m)) + return; + + manager_flip_auto_status(m, false, "boot finished"); + + /* Turn off confirm spawn now */ + m->confirm_spawn = NULL; + + /* No need to update ask password status when we're going non-interactive */ + manager_close_ask_password(m); + + /* This is no longer the first boot */ + manager_set_first_boot(m, false); + + dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_FINISH); + + manager_notify_finished(m); + + manager_invalidate_startup_units(m); +} + +void manager_send_reloading(Manager *m) { + assert(m); + + /* Let whoever invoked us know that we are now reloading */ + (void) sd_notifyf(/* unset= */ false, + "RELOADING=1\n" + "MONOTONIC_USEC=" USEC_FMT "\n", now(CLOCK_MONOTONIC)); + + /* And ensure that we'll send READY=1 again as soon as we are ready again */ + m->ready_sent = false; +} + +static bool generator_path_any(const char* const* paths) { + bool found = false; + + /* Optimize by skipping the whole process by not creating output directories + * if no generators are found. */ + STRV_FOREACH(path, paths) + if (access(*path, F_OK) == 0) + found = true; + else if (errno != ENOENT) + log_warning_errno(errno, "Failed to open generator directory %s: %m", *path); + + return found; +} + +static int manager_run_environment_generators(Manager *m) { + char **tmp = NULL; /* this is only used in the forked process, no cleanup here */ + _cleanup_strv_free_ char **paths = NULL; + void* args[] = { + [STDOUT_GENERATE] = &tmp, + [STDOUT_COLLECT] = &tmp, + [STDOUT_CONSUME] = &m->transient_environment, + }; + int r; + + if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_ENV_GENERATORS)) + return 0; + + paths = env_generator_binary_paths(m->runtime_scope); + if (!paths) + return log_oom(); + + if (!generator_path_any((const char* const*) paths)) + return 0; + + WITH_UMASK(0022) + r = execute_directories((const char* const*) paths, DEFAULT_TIMEOUT_USEC, gather_environment, + args, NULL, m->transient_environment, + EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID); + return r; +} + +static int build_generator_environment(Manager *m, char ***ret) { + _cleanup_strv_free_ char **nl = NULL; + Virtualization v; + ConfidentialVirtualization cv; + int r; + + assert(m); + assert(ret); + + /* Generators oftentimes want to know some basic facts about the environment they run in, in order to + * adjust generated units to that. Let's pass down some bits of information that are easy for us to + * determine (but a bit harder for generator scripts to determine), as environment variables. */ + + nl = strv_copy(m->transient_environment); + if (!nl) + return -ENOMEM; + + r = strv_env_assign(&nl, "SYSTEMD_SCOPE", runtime_scope_to_string(m->runtime_scope)); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(m)) { + /* Note that $SYSTEMD_IN_INITRD may be used to override the initrd detection in much of our + * codebase. This is hence more than purely informational. It will shortcut detection of the + * initrd state if generators invoke our own tools. But that's OK, as it would come to the + * same results (hopefully). */ + r = strv_env_assign(&nl, "SYSTEMD_IN_INITRD", one_zero(in_initrd())); + if (r < 0) + return r; + + if (m->first_boot >= 0) { + r = strv_env_assign(&nl, "SYSTEMD_FIRST_BOOT", one_zero(m->first_boot)); + if (r < 0) + return r; + } + } + + v = detect_virtualization(); + if (v < 0) + log_debug_errno(v, "Failed to detect virtualization, ignoring: %m"); + else if (v > 0) { + const char *s; + + s = strjoina(VIRTUALIZATION_IS_VM(v) ? "vm:" : + VIRTUALIZATION_IS_CONTAINER(v) ? "container:" : ":", + virtualization_to_string(v)); + + r = strv_env_assign(&nl, "SYSTEMD_VIRTUALIZATION", s); + if (r < 0) + return r; + } + + cv = detect_confidential_virtualization(); + if (cv < 0) + log_debug_errno(cv, "Failed to detect confidential virtualization, ignoring: %m"); + else if (cv > 0) { + r = strv_env_assign(&nl, "SYSTEMD_CONFIDENTIAL_VIRTUALIZATION", confidential_virtualization_to_string(cv)); + if (r < 0) + return r; + } + + r = strv_env_assign(&nl, "SYSTEMD_ARCHITECTURE", architecture_to_string(uname_architecture())); + if (r < 0) + return r; + + *ret = TAKE_PTR(nl); + return 0; +} + +static int manager_execute_generators(Manager *m, char **paths, bool remount_ro) { + _cleanup_strv_free_ char **ge = NULL; + const char *argv[] = { + NULL, /* Leave this empty, execute_directory() will fill something in */ + m->lookup_paths.generator, + m->lookup_paths.generator_early, + m->lookup_paths.generator_late, + NULL, + }; + int r; + + r = build_generator_environment(m, &ge); + if (r < 0) + return log_error_errno(r, "Failed to build generator environment: %m"); + + if (remount_ro) { + /* Remount most of the filesystem tree read-only. We leave /sys/ as-is, because our code + * checks whether it is read-only to detect containerized execution environments. We leave + * /run/ as-is too, because that's where our output goes. We also leave /proc/ and /dev/shm/ + * because they're API, and /tmp/ that safe_fork() mounted for us. + */ + r = bind_remount_recursive("/", MS_RDONLY, MS_RDONLY, + STRV_MAKE("/sys", "/run", "/proc", "/dev/shm", "/tmp")); + if (r < 0) + log_warning_errno(r, "Read-only bind remount failed, ignoring: %m"); + } + + BLOCK_WITH_UMASK(0022); + return execute_directories( + (const char* const*) paths, + DEFAULT_TIMEOUT_USEC, + /* callbacks= */ NULL, /* callback_args= */ NULL, + (char**) argv, + ge, + EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID); +} + +static int manager_run_generators(Manager *m) { + ForkFlags flags = FORK_RESET_SIGNALS | FORK_WAIT | FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE; + _cleanup_strv_free_ char **paths = NULL; + int r; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_GENERATORS)) + return 0; + + paths = generator_binary_paths(m->runtime_scope); + if (!paths) + return log_oom(); + + if (!generator_path_any((const char* const*) paths)) + return 0; + + r = lookup_paths_mkdir_generator(&m->lookup_paths); + if (r < 0) { + log_error_errno(r, "Failed to create generator directories: %m"); + goto finish; + } + + /* If we are the system manager, we fork and invoke the generators in a sanitized mount namespace. If + * we are the user manager, let's just execute the generators directly. We might not have the + * necessary privileges, and the system manager has already mounted /tmp/ and everything else for us. + */ + if (MANAGER_IS_USER(m)) { + r = manager_execute_generators(m, paths, /* remount_ro= */ false); + goto finish; + } + + /* On some systems /tmp/ doesn't exist, and on some other systems we cannot create it at all. Avoid + * trying to mount a private tmpfs on it as there's no one size fits all. */ + if (is_dir("/tmp", /* follow= */ false) > 0) + flags |= FORK_PRIVATE_TMP; + + r = safe_fork("(sd-gens)", flags, NULL); + if (r == 0) { + r = manager_execute_generators(m, paths, /* remount_ro= */ true); + _exit(r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE); + } + if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(r) && r != -EINVAL) { + log_error_errno(r, "Failed to fork off sandboxing environment for executing generators: %m"); + goto finish; + } + + /* Failed to fork with new mount namespace? Maybe, running in a container environment with + * seccomp or without capability. + * + * We also allow -EINVAL to allow running without CLONE_NEWNS. + * + * Also, when running on non-native userland architecture via systemd-nspawn and + * qemu-user-static QEMU-emulator, clone() with CLONE_NEWNS fails with EINVAL, see + * https://github.com/systemd/systemd/issues/28901. + */ + log_debug_errno(r, + "Failed to fork off sandboxing environment for executing generators. " + "Falling back to execute generators without sandboxing: %m"); + r = manager_execute_generators(m, paths, /* remount_ro= */ false); + } + +finish: + lookup_paths_trim_generator(&m->lookup_paths); + return r; +} + +int manager_transient_environment_add(Manager *m, char **plus) { + char **a; + + assert(m); + + if (strv_isempty(plus)) + return 0; + + a = strv_env_merge(m->transient_environment, plus); + if (!a) + return log_oom(); + + sanitize_environment(a); + + return strv_free_and_replace(m->transient_environment, a); +} + +int manager_client_environment_modify( + Manager *m, + char **minus, + char **plus) { + + char **a = NULL, **b = NULL, **l; + + assert(m); + + if (strv_isempty(minus) && strv_isempty(plus)) + return 0; + + l = m->client_environment; + + if (!strv_isempty(minus)) { + a = strv_env_delete(l, 1, minus); + if (!a) + return -ENOMEM; + + l = a; + } + + if (!strv_isempty(plus)) { + b = strv_env_merge(l, plus); + if (!b) { + strv_free(a); + return -ENOMEM; + } + + l = b; + } + + if (m->client_environment != l) + strv_free(m->client_environment); + + if (a != l) + strv_free(a); + if (b != l) + strv_free(b); + + m->client_environment = sanitize_environment(l); + return 0; +} + +int manager_get_effective_environment(Manager *m, char ***ret) { + char **l; + + assert(m); + assert(ret); + + l = strv_env_merge(m->transient_environment, m->client_environment); + if (!l) + return -ENOMEM; + + *ret = l; + return 0; +} + +int manager_set_unit_defaults(Manager *m, const UnitDefaults *defaults) { + _cleanup_free_ char *label = NULL; + struct rlimit *rlimit[_RLIMIT_MAX]; + int r; + + assert(m); + assert(defaults); + + if (streq_ptr(defaults->smack_process_label, "/")) + label = NULL; + else { + const char *l = defaults->smack_process_label; +#ifdef SMACK_DEFAULT_PROCESS_LABEL + if (!l) + l = SMACK_DEFAULT_PROCESS_LABEL; +#endif + if (l) { + label = strdup(l); + if (!label) + return -ENOMEM; + } else + label = NULL; + } + + r = rlimit_copy_all(rlimit, defaults->rlimit); + if (r < 0) + return r; + + m->defaults.std_output = defaults->std_output; + m->defaults.std_error = defaults->std_error; + + m->defaults.restart_usec = defaults->restart_usec; + m->defaults.timeout_start_usec = defaults->timeout_start_usec; + m->defaults.timeout_stop_usec = defaults->timeout_stop_usec; + m->defaults.timeout_abort_usec = defaults->timeout_abort_usec; + m->defaults.timeout_abort_set = defaults->timeout_abort_set; + m->defaults.device_timeout_usec = defaults->device_timeout_usec; + + m->defaults.start_limit_interval = defaults->start_limit_interval; + m->defaults.start_limit_burst = defaults->start_limit_burst; + + m->defaults.cpu_accounting = defaults->cpu_accounting; + m->defaults.memory_accounting = defaults->memory_accounting; + m->defaults.io_accounting = defaults->io_accounting; + m->defaults.blockio_accounting = defaults->blockio_accounting; + m->defaults.tasks_accounting = defaults->tasks_accounting; + m->defaults.ip_accounting = defaults->ip_accounting; + + m->defaults.tasks_max = defaults->tasks_max; + m->defaults.timer_accuracy_usec = defaults->timer_accuracy_usec; + + m->defaults.oom_policy = defaults->oom_policy; + m->defaults.oom_score_adjust = defaults->oom_score_adjust; + m->defaults.oom_score_adjust_set = defaults->oom_score_adjust_set; + + m->defaults.memory_pressure_watch = defaults->memory_pressure_watch; + m->defaults.memory_pressure_threshold_usec = defaults->memory_pressure_threshold_usec; + + free_and_replace(m->defaults.smack_process_label, label); + rlimit_free_all(m->defaults.rlimit); + memcpy(m->defaults.rlimit, rlimit, sizeof(struct rlimit*) * _RLIMIT_MAX); + + return 0; +} + +void manager_recheck_dbus(Manager *m) { + assert(m); + + /* Connects to the bus if the dbus service and socket are running. If we are running in user mode + * this is all it does. In system mode we'll also connect to the system bus (which will most likely + * just reuse the connection of the API bus). That's because the system bus after all runs as service + * of the system instance, while in the user instance we can assume it's already there. */ + + if (MANAGER_IS_RELOADING(m)) + return; /* don't check while we are reloading… */ + + if (manager_dbus_is_running(m, false)) { + (void) bus_init_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_init_system(m); + } else { + (void) bus_done_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_done_system(m); + } +} + +static bool manager_journal_is_running(Manager *m) { + Unit *u; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return false; + + /* If we are the user manager we can safely assume that the journal is up */ + if (!MANAGER_IS_SYSTEM(m)) + return true; + + /* Check that the socket is not only up, but in RUNNING state */ + u = manager_get_unit(m, SPECIAL_JOURNALD_SOCKET); + if (!u) + return false; + if (SOCKET(u)->state != SOCKET_RUNNING) + return false; + + /* Similar, check if the daemon itself is fully up, too */ + u = manager_get_unit(m, SPECIAL_JOURNALD_SERVICE); + if (!u) + return false; + if (!IN_SET(SERVICE(u)->state, SERVICE_RELOAD, SERVICE_RUNNING)) + return false; + + return true; +} + +void disable_printk_ratelimit(void) { + /* Disable kernel's printk ratelimit. + * + * Logging to /dev/kmsg is most useful during early boot and shutdown, where normal logging + * mechanisms are not available. The semantics of this sysctl are such that any kernel command-line + * setting takes precedence. */ + int r; + + r = sysctl_write("kernel/printk_devkmsg", "on"); + if (r < 0) + log_debug_errno(r, "Failed to set sysctl kernel.printk_devkmsg=on: %m"); +} + +void manager_recheck_journal(Manager *m) { + + assert(m); + + /* Don't bother with this unless we are in the special situation of being PID 1 */ + if (getpid_cached() != 1) + return; + + /* Don't check this while we are reloading, things might still change */ + if (MANAGER_IS_RELOADING(m)) + return; + + /* The journal is fully and entirely up? If so, let's permit logging to it, if that's configured. If + * the journal is down, don't ever log to it, otherwise we might end up deadlocking ourselves as we + * might trigger an activation ourselves we can't fulfill. */ + log_set_prohibit_ipc(!manager_journal_is_running(m)); + log_open(); +} + +static ShowStatus manager_get_show_status(Manager *m) { + assert(m); + + if (MANAGER_IS_USER(m)) + return _SHOW_STATUS_INVALID; + + if (m->show_status_overridden != _SHOW_STATUS_INVALID) + return m->show_status_overridden; + + return m->show_status; +} + +bool manager_get_show_status_on(Manager *m) { + assert(m); + + return show_status_on(manager_get_show_status(m)); +} + +static void set_show_status_marker(bool b) { + if (b) + (void) touch("/run/systemd/show-status"); + else + (void) unlink("/run/systemd/show-status"); +} + +void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason) { + assert(m); + assert(reason); + assert(mode >= 0 && mode < _SHOW_STATUS_MAX); + + if (MANAGER_IS_USER(m)) + return; + + if (mode == m->show_status) + return; + + if (m->show_status_overridden == _SHOW_STATUS_INVALID) { + bool enabled; + + enabled = show_status_on(mode); + log_debug("%s (%s) showing of status (%s).", + enabled ? "Enabling" : "Disabling", + strna(show_status_to_string(mode)), + reason); + + set_show_status_marker(enabled); + } + + m->show_status = mode; +} + +void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason) { + assert(m); + assert(mode < _SHOW_STATUS_MAX); + + if (MANAGER_IS_USER(m)) + return; + + if (mode == m->show_status_overridden) + return; + + m->show_status_overridden = mode; + + if (mode == _SHOW_STATUS_INVALID) + mode = m->show_status; + + log_debug("%s (%s) showing of status (%s).", + m->show_status_overridden != _SHOW_STATUS_INVALID ? "Overriding" : "Restoring", + strna(show_status_to_string(mode)), + reason); + + set_show_status_marker(show_status_on(mode)); +} + +const char *manager_get_confirm_spawn(Manager *m) { + static int last_errno = 0; + struct stat st; + int r; + + assert(m); + + /* Here's the deal: we want to test the validity of the console but don't want + * PID1 to go through the whole console process which might block. But we also + * want to warn the user only once if something is wrong with the console so we + * cannot do the sanity checks after spawning our children. So here we simply do + * really basic tests to hopefully trap common errors. + * + * If the console suddenly disappear at the time our children will really it + * then they will simply fail to acquire it and a positive answer will be + * assumed. New children will fall back to /dev/console though. + * + * Note: TTYs are devices that can come and go any time, and frequently aren't + * available yet during early boot (consider a USB rs232 dongle...). If for any + * reason the configured console is not ready, we fall back to the default + * console. */ + + if (!m->confirm_spawn || path_equal(m->confirm_spawn, "/dev/console")) + return m->confirm_spawn; + + if (stat(m->confirm_spawn, &st) < 0) { + r = -errno; + goto fail; + } + + if (!S_ISCHR(st.st_mode)) { + r = -ENOTTY; + goto fail; + } + + last_errno = 0; + return m->confirm_spawn; + +fail: + if (last_errno != r) + last_errno = log_warning_errno(r, "Failed to open %s, using default console: %m", m->confirm_spawn); + + return "/dev/console"; +} + +void manager_set_first_boot(Manager *m, bool b) { + assert(m); + + if (!MANAGER_IS_SYSTEM(m)) + return; + + if (m->first_boot != (int) b) { + if (b) + (void) touch("/run/systemd/first-boot"); + else + (void) unlink("/run/systemd/first-boot"); + } + + m->first_boot = b; +} + +void manager_disable_confirm_spawn(void) { + (void) touch("/run/systemd/confirm_spawn_disabled"); +} + +static bool manager_should_show_status(Manager *m, StatusType type) { + assert(m); + + if (!MANAGER_IS_SYSTEM(m)) + return false; + + if (m->no_console_output) + return false; + + if (!IN_SET(manager_state(m), MANAGER_INITIALIZING, MANAGER_STARTING, MANAGER_STOPPING)) + return false; + + /* If we cannot find out the status properly, just proceed. */ + if (type != STATUS_TYPE_EMERGENCY && manager_check_ask_password(m) > 0) + return false; + + if (type == STATUS_TYPE_NOTICE && m->show_status != SHOW_STATUS_NO) + return true; + + return manager_get_show_status_on(m); +} + +void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) { + va_list ap; + + /* If m is NULL, assume we're after shutdown and let the messages through. */ + + if (m && !manager_should_show_status(m, type)) + return; + + /* XXX We should totally drop the check for ephemeral here + * and thus effectively make 'Type=idle' pointless. */ + if (type == STATUS_TYPE_EPHEMERAL && m && m->n_on_console > 0) + return; + + va_start(ap, format); + status_vprintf(status, SHOW_STATUS_ELLIPSIZE|(type == STATUS_TYPE_EPHEMERAL ? SHOW_STATUS_EPHEMERAL : 0), format, ap); + va_end(ap); +} + +Set* manager_get_units_requiring_mounts_for(Manager *m, const char *path) { + assert(m); + assert(path); + + if (path_equal(path, "/")) + path = ""; + + return hashmap_get(m->units_requiring_mounts_for, path); +} + +int manager_update_failed_units(Manager *m, Unit *u, bool failed) { + unsigned size; + int r; + + assert(m); + assert(u->manager == m); + + size = set_size(m->failed_units); + + if (failed) { + r = set_ensure_put(&m->failed_units, NULL, u); + if (r < 0) + return log_oom(); + } else + (void) set_remove(m->failed_units, u); + + if (set_size(m->failed_units) != size) + bus_manager_send_change_signal(m); + + return 0; +} + +ManagerState manager_state(Manager *m) { + Unit *u; + + assert(m); + + /* Is the special shutdown target active or queued? If so, we are in shutdown state */ + u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET); + if (u && unit_active_or_pending(u)) + return MANAGER_STOPPING; + + /* Did we ever finish booting? If not then we are still starting up */ + if (!MANAGER_IS_FINISHED(m)) { + + u = manager_get_unit(m, SPECIAL_BASIC_TARGET); + if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return MANAGER_INITIALIZING; + + return MANAGER_STARTING; + } + + if (MANAGER_IS_SYSTEM(m)) { + /* Are the rescue or emergency targets active or queued? If so we are in maintenance state */ + u = manager_get_unit(m, SPECIAL_RESCUE_TARGET); + if (u && unit_active_or_pending(u)) + return MANAGER_MAINTENANCE; + + u = manager_get_unit(m, SPECIAL_EMERGENCY_TARGET); + if (u && unit_active_or_pending(u)) + return MANAGER_MAINTENANCE; + } + + /* Are there any failed units? If so, we are in degraded mode */ + if (set_size(m->failed_units) > 0) + return MANAGER_DEGRADED; + + return MANAGER_RUNNING; +} + +static void manager_unref_uid_internal( + Hashmap *uid_refs, + uid_t uid, + bool destroy_now, + int (*_clean_ipc)(uid_t uid)) { + + uint32_t c, n; + + assert(uid_is_valid(uid)); + assert(_clean_ipc); + + /* A generic implementation, covering both manager_unref_uid() and manager_unref_gid(), under the + * assumption that uid_t and gid_t are actually defined the same way, with the same validity rules. + * + * We store a hashmap where the key is the UID/GID and the value is a 32-bit reference counter, whose + * highest bit is used as flag for marking UIDs/GIDs whose IPC objects to remove when the last + * reference to the UID/GID is dropped. The flag is set to on, once at least one reference from a + * unit where RemoveIPC= is set is added on a UID/GID. It is reset when the UID's/GID's reference + * counter drops to 0 again. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (uid == 0) /* We don't keep track of root, and will never destroy it */ + return; + + c = PTR_TO_UINT32(hashmap_get(uid_refs, UID_TO_PTR(uid))); + + n = c & ~DESTROY_IPC_FLAG; + assert(n > 0); + n--; + + if (destroy_now && n == 0) { + hashmap_remove(uid_refs, UID_TO_PTR(uid)); + + if (c & DESTROY_IPC_FLAG) { + log_debug("%s " UID_FMT " is no longer referenced, cleaning up its IPC.", + _clean_ipc == clean_ipc_by_uid ? "UID" : "GID", + uid); + (void) _clean_ipc(uid); + } + } else { + c = n | (c & DESTROY_IPC_FLAG); + assert_se(hashmap_update(uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)) >= 0); + } +} + +void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now) { + manager_unref_uid_internal(m->uid_refs, uid, destroy_now, clean_ipc_by_uid); +} + +void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now) { + manager_unref_uid_internal(m->gid_refs, (uid_t) gid, destroy_now, clean_ipc_by_gid); +} + +static int manager_ref_uid_internal( + Hashmap **uid_refs, + uid_t uid, + bool clean_ipc) { + + uint32_t c, n; + int r; + + assert(uid_refs); + assert(uid_is_valid(uid)); + + /* A generic implementation, covering both manager_ref_uid() and manager_ref_gid(), under the + * assumption that uid_t and gid_t are actually defined the same way, with the same validity + * rules. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (uid == 0) /* We don't keep track of root, and will never destroy it */ + return 0; + + r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops); + if (r < 0) + return r; + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + + n = c & ~DESTROY_IPC_FLAG; + n++; + + if (n & DESTROY_IPC_FLAG) /* check for overflow */ + return -EOVERFLOW; + + c = n | (c & DESTROY_IPC_FLAG) | (clean_ipc ? DESTROY_IPC_FLAG : 0); + + return hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)); +} + +int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc) { + return manager_ref_uid_internal(&m->uid_refs, uid, clean_ipc); +} + +int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc) { + return manager_ref_uid_internal(&m->gid_refs, (uid_t) gid, clean_ipc); +} + +static void manager_vacuum_uid_refs_internal( + Hashmap *uid_refs, + int (*_clean_ipc)(uid_t uid)) { + + void *p, *k; + + assert(_clean_ipc); + + HASHMAP_FOREACH_KEY(p, k, uid_refs) { + uint32_t c, n; + uid_t uid; + + uid = PTR_TO_UID(k); + c = PTR_TO_UINT32(p); + + n = c & ~DESTROY_IPC_FLAG; + if (n > 0) + continue; + + if (c & DESTROY_IPC_FLAG) { + log_debug("Found unreferenced %s " UID_FMT " after reload/reexec. Cleaning up.", + _clean_ipc == clean_ipc_by_uid ? "UID" : "GID", + uid); + (void) _clean_ipc(uid); + } + + assert_se(hashmap_remove(uid_refs, k) == p); + } +} + +static void manager_vacuum_uid_refs(Manager *m) { + manager_vacuum_uid_refs_internal(m->uid_refs, clean_ipc_by_uid); +} + +static void manager_vacuum_gid_refs(Manager *m) { + manager_vacuum_uid_refs_internal(m->gid_refs, clean_ipc_by_gid); +} + +static void manager_vacuum(Manager *m) { + assert(m); + + /* Release any dynamic users no longer referenced */ + dynamic_user_vacuum(m, true); + + /* Release any references to UIDs/GIDs no longer referenced, and destroy any IPC owned by them */ + manager_vacuum_uid_refs(m); + manager_vacuum_gid_refs(m); + + /* Release any runtimes no longer referenced */ + exec_shared_runtime_vacuum(m); +} + +int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + struct buffer { + uid_t uid; + gid_t gid; + char unit_name[UNIT_NAME_MAX+1]; + } _packed_ buffer; + + Manager *m = userdata; + ssize_t l; + size_t n; + Unit *u; + + assert_se(source); + assert_se(m); + + /* Invoked whenever a child process succeeded resolving its user/group to use and sent us the + * resulting UID/GID in a datagram. We parse the datagram here and pass it off to the unit, so that + * it can add a reference to the UID/GID so that it can destroy the UID/GID's IPC objects when the + * reference counter drops to 0. */ + + l = recv(fd, &buffer, sizeof(buffer), MSG_DONTWAIT); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return log_error_errno(errno, "Failed to read from user lookup fd: %m"); + } + + if ((size_t) l <= offsetof(struct buffer, unit_name)) { + log_warning("Received too short user lookup message, ignoring."); + return 0; + } + + if ((size_t) l > offsetof(struct buffer, unit_name) + UNIT_NAME_MAX) { + log_warning("Received too long user lookup message, ignoring."); + return 0; + } + + if (!uid_is_valid(buffer.uid) && !gid_is_valid(buffer.gid)) { + log_warning("Got user lookup message with invalid UID/GID pair, ignoring."); + return 0; + } + + n = (size_t) l - offsetof(struct buffer, unit_name); + if (memchr(buffer.unit_name, 0, n)) { + log_warning("Received lookup message with embedded NUL character, ignoring."); + return 0; + } + + buffer.unit_name[n] = 0; + u = manager_get_unit(m, buffer.unit_name); + if (!u) { + log_debug("Got user lookup message but unit doesn't exist, ignoring."); + return 0; + } + + log_unit_debug(u, "User lookup succeeded: uid=" UID_FMT " gid=" GID_FMT, buffer.uid, buffer.gid); + + unit_notify_user_lookup(u, buffer.uid, buffer.gid); + return 0; +} + +static int short_uid_range(const char *path) { + _cleanup_(uid_range_freep) UidRange *p = NULL; + int r; + + assert(path); + + /* Taint systemd if we the UID range assigned to this environment doesn't at least cover 0…65534, + * i.e. from root to nobody. */ + + r = uid_range_load_userns(&p, path); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + return false; + if (r < 0) + return log_debug_errno(r, "Failed to load %s: %m", path); + + return !uid_range_covers(p, 0, 65535); +} + +char* manager_taint_string(const Manager *m) { + /* Returns a "taint string", e.g. "local-hwclock:var-run-bad". Only things that are detected at + * runtime should be tagged here. For stuff that is known during compilation, emit a warning in the + * configuration phase. */ + + assert(m); + + const char* stage[12] = {}; + size_t n = 0; + + _cleanup_free_ char *usrbin = NULL; + if (readlink_malloc("/bin", &usrbin) < 0 || !PATH_IN_SET(usrbin, "usr/bin", "/usr/bin")) + stage[n++] = "unmerged-usr"; + + if (access("/proc/cgroups", F_OK) < 0) + stage[n++] = "cgroups-missing"; + + if (cg_all_unified() == 0) + stage[n++] = "cgroupsv1"; + + if (clock_is_localtime(NULL) > 0) + stage[n++] = "local-hwclock"; + + if (os_release_support_ended(NULL, /* quiet= */ true, NULL) > 0) + stage[n++] = "support-ended"; + + _cleanup_free_ char *destination = NULL; + if (readlink_malloc("/var/run", &destination) < 0 || + !PATH_IN_SET(destination, "../run", "/run")) + stage[n++] = "var-run-bad"; + + _cleanup_free_ char *overflowuid = NULL, *overflowgid = NULL; + if (read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid) >= 0 && + !streq(overflowuid, "65534")) + stage[n++] = "overflowuid-not-65534"; + if (read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid) >= 0 && + !streq(overflowgid, "65534")) + stage[n++] = "overflowgid-not-65534"; + + struct utsname uts; + assert_se(uname(&uts) >= 0); + if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0) + stage[n++] = "old-kernel"; + + if (short_uid_range("/proc/self/uid_map") > 0) + stage[n++] = "short-uid-range"; + if (short_uid_range("/proc/self/gid_map") > 0) + stage[n++] = "short-gid-range"; + + assert(n < ELEMENTSOF(stage) - 1); /* One extra for NULL terminator */ + + return strv_join((char**) stage, ":"); +} + +void manager_ref_console(Manager *m) { + assert(m); + + m->n_on_console++; +} + +void manager_unref_console(Manager *m) { + + assert(m->n_on_console > 0); + m->n_on_console--; + + if (m->n_on_console == 0) + m->no_console_output = false; /* unset no_console_output flag, since the console is definitely free now */ +} + +void manager_override_log_level(Manager *m, int level) { + _cleanup_free_ char *s = NULL; + assert(m); + + if (!m->log_level_overridden) { + m->original_log_level = log_get_max_level(); + m->log_level_overridden = true; + } + + (void) log_level_to_string_alloc(level, &s); + log_info("Setting log level to %s.", strna(s)); + + log_set_max_level(level); +} + +void manager_restore_original_log_level(Manager *m) { + _cleanup_free_ char *s = NULL; + assert(m); + + if (!m->log_level_overridden) + return; + + (void) log_level_to_string_alloc(m->original_log_level, &s); + log_info("Restoring log level to original (%s).", strna(s)); + + log_set_max_level(m->original_log_level); + m->log_level_overridden = false; +} + +void manager_override_log_target(Manager *m, LogTarget target) { + assert(m); + + if (!m->log_target_overridden) { + m->original_log_target = log_get_target(); + m->log_target_overridden = true; + } + + log_info("Setting log target to %s.", log_target_to_string(target)); + log_set_target(target); +} + +void manager_restore_original_log_target(Manager *m) { + assert(m); + + if (!m->log_target_overridden) + return; + + log_info("Restoring log target to original %s.", log_target_to_string(m->original_log_target)); + + log_set_target(m->original_log_target); + m->log_target_overridden = false; +} + +ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s) { + if (in_initrd() && + s >= MANAGER_TIMESTAMP_SECURITY_START && + s <= MANAGER_TIMESTAMP_UNITS_LOAD_FINISH) + return s - MANAGER_TIMESTAMP_SECURITY_START + MANAGER_TIMESTAMP_INITRD_SECURITY_START; + return s; +} + +int manager_allocate_idle_pipe(Manager *m) { + int r; + + assert(m); + + if (m->idle_pipe[0] >= 0) { + assert(m->idle_pipe[1] >= 0); + assert(m->idle_pipe[2] >= 0); + assert(m->idle_pipe[3] >= 0); + return 0; + } + + assert(m->idle_pipe[1] < 0); + assert(m->idle_pipe[2] < 0); + assert(m->idle_pipe[3] < 0); + + r = RET_NERRNO(pipe2(m->idle_pipe + 0, O_NONBLOCK|O_CLOEXEC)); + if (r < 0) + return r; + + r = RET_NERRNO(pipe2(m->idle_pipe + 2, O_NONBLOCK|O_CLOEXEC)); + if (r < 0) { + safe_close_pair(m->idle_pipe + 0); + return r; + } + + return 1; +} + +void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope) { + assert(defaults); + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + + *defaults = (UnitDefaults) { + .std_output = EXEC_OUTPUT_JOURNAL, + .std_error = EXEC_OUTPUT_INHERIT, + .restart_usec = DEFAULT_RESTART_USEC, + .timeout_start_usec = manager_default_timeout(scope), + .timeout_stop_usec = manager_default_timeout(scope), + .timeout_abort_usec = manager_default_timeout(scope), + .timeout_abort_set = false, + .device_timeout_usec = manager_default_timeout(scope), + .start_limit_interval = DEFAULT_START_LIMIT_INTERVAL, + .start_limit_burst = DEFAULT_START_LIMIT_BURST, + + /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU + * controller to be enabled, so the default is to enable it unless we got told otherwise. */ + .cpu_accounting = cpu_accounting_is_cheap(), + .memory_accounting = MEMORY_ACCOUNTING_DEFAULT, + .io_accounting = false, + .blockio_accounting = false, + .tasks_accounting = true, + .ip_accounting = false, + + .tasks_max = DEFAULT_TASKS_MAX, + .timer_accuracy_usec = 1 * USEC_PER_MINUTE, + + .memory_pressure_watch = CGROUP_PRESSURE_WATCH_AUTO, + .memory_pressure_threshold_usec = MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC, + + .oom_policy = OOM_STOP, + .oom_score_adjust_set = false, + }; +} + +void unit_defaults_done(UnitDefaults *defaults) { + assert(defaults); + + defaults->smack_process_label = mfree(defaults->smack_process_label); + rlimit_free_all(defaults->rlimit); +} + +LogTarget manager_get_executor_log_target(Manager *m) { + assert(m); + + /* If journald is not available tell sd-executor to go to kmsg, as it might be starting journald */ + + if (manager_journal_is_running(m)) + return log_get_target(); + + return LOG_TARGET_KMSG; +} + +static const char *const manager_state_table[_MANAGER_STATE_MAX] = { + [MANAGER_INITIALIZING] = "initializing", + [MANAGER_STARTING] = "starting", + [MANAGER_RUNNING] = "running", + [MANAGER_DEGRADED] = "degraded", + [MANAGER_MAINTENANCE] = "maintenance", + [MANAGER_STOPPING] = "stopping", +}; + +DEFINE_STRING_TABLE_LOOKUP(manager_state, ManagerState); + +static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = { + [MANAGER_TIMESTAMP_FIRMWARE] = "firmware", + [MANAGER_TIMESTAMP_LOADER] = "loader", + [MANAGER_TIMESTAMP_KERNEL] = "kernel", + [MANAGER_TIMESTAMP_INITRD] = "initrd", + [MANAGER_TIMESTAMP_USERSPACE] = "userspace", + [MANAGER_TIMESTAMP_FINISH] = "finish", + [MANAGER_TIMESTAMP_SECURITY_START] = "security-start", + [MANAGER_TIMESTAMP_SECURITY_FINISH] = "security-finish", + [MANAGER_TIMESTAMP_GENERATORS_START] = "generators-start", + [MANAGER_TIMESTAMP_GENERATORS_FINISH] = "generators-finish", + [MANAGER_TIMESTAMP_UNITS_LOAD_START] = "units-load-start", + [MANAGER_TIMESTAMP_UNITS_LOAD_FINISH] = "units-load-finish", + [MANAGER_TIMESTAMP_UNITS_LOAD] = "units-load", + [MANAGER_TIMESTAMP_INITRD_SECURITY_START] = "initrd-security-start", + [MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH] = "initrd-security-finish", + [MANAGER_TIMESTAMP_INITRD_GENERATORS_START] = "initrd-generators-start", + [MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH] = "initrd-generators-finish", + [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START] = "initrd-units-load-start", + [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH] = "initrd-units-load-finish", +}; + +DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp); + +static const char* const oom_policy_table[_OOM_POLICY_MAX] = { + [OOM_CONTINUE] = "continue", + [OOM_STOP] = "stop", + [OOM_KILL] = "kill", +}; + +DEFINE_STRING_TABLE_LOOKUP(oom_policy, OOMPolicy); diff --git a/src/core/manager.h b/src/core/manager.h new file mode 100644 index 0000000..d96eb7b --- /dev/null +++ b/src/core/manager.h @@ -0,0 +1,646 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-event.h" + +#include "common-signal.h" +#include "cgroup-util.h" +#include "cgroup.h" +#include "fdset.h" +#include "hashmap.h" +#include "list.h" +#include "prioq.h" +#include "ratelimit.h" +#include "varlink.h" + +struct libmnt_monitor; +typedef struct Unit Unit; + +/* Enforce upper limit how many names we allow */ +#define MANAGER_MAX_NAMES 131072 /* 128K */ + +/* On sigrtmin+18, private commands */ +enum { + MANAGER_SIGNAL_COMMAND_DUMP_JOBS = _COMMON_SIGNAL_COMMAND_PRIVATE_BASE + 0, + _MANAGER_SIGNAL_COMMAND_MAX, +}; + +assert_cc((int) _MANAGER_SIGNAL_COMMAND_MAX <= (int) _COMMON_SIGNAL_COMMAND_PRIVATE_END); + +typedef struct Manager Manager; + +/* An externally visible state. We don't actually maintain this as state variable, but derive it from various fields + * when requested */ +typedef enum ManagerState { + MANAGER_INITIALIZING, + MANAGER_STARTING, + MANAGER_RUNNING, + MANAGER_DEGRADED, + MANAGER_MAINTENANCE, + MANAGER_STOPPING, + _MANAGER_STATE_MAX, + _MANAGER_STATE_INVALID = -EINVAL, +} ManagerState; + +typedef enum ManagerObjective { + MANAGER_OK, + MANAGER_EXIT, + MANAGER_RELOAD, + MANAGER_REEXECUTE, + MANAGER_REBOOT, + MANAGER_SOFT_REBOOT, + MANAGER_POWEROFF, + MANAGER_HALT, + MANAGER_KEXEC, + MANAGER_SWITCH_ROOT, + _MANAGER_OBJECTIVE_MAX, + _MANAGER_OBJECTIVE_INVALID = -EINVAL, +} ManagerObjective; + +typedef enum StatusType { + STATUS_TYPE_EPHEMERAL, + STATUS_TYPE_NORMAL, + STATUS_TYPE_NOTICE, + STATUS_TYPE_EMERGENCY, +} StatusType; + +typedef enum OOMPolicy { + OOM_CONTINUE, /* The kernel or systemd-oomd kills the process it wants to kill, and that's it */ + OOM_STOP, /* The kernel or systemd-oomd kills the process it wants to kill, and we stop the unit */ + OOM_KILL, /* The kernel or systemd-oomd kills the process it wants to kill, and all others in the unit, and we stop the unit */ + _OOM_POLICY_MAX, + _OOM_POLICY_INVALID = -EINVAL, +} OOMPolicy; + +/* Notes: + * 1. TIMESTAMP_FIRMWARE, TIMESTAMP_LOADER, TIMESTAMP_KERNEL, TIMESTAMP_INITRD, + * TIMESTAMP_SECURITY_START, and TIMESTAMP_SECURITY_FINISH are set only when + * the manager is system and not running under container environment. + * + * 2. The monotonic timestamp of TIMESTAMP_KERNEL is always zero. + * + * 3. The realtime timestamp of TIMESTAMP_KERNEL will be unset if the system does not + * have RTC. + * + * 4. TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER will be unset if the system does not + * have RTC, or systemd is built without EFI support. + * + * 5. The monotonic timestamps of TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER are stored as + * negative of the actual value. + * + * 6. TIMESTAMP_USERSPACE is the timestamp of when the manager was started. + * + * 7. TIMESTAMP_INITRD_* are set only when the system is booted with an initrd. + */ + +typedef enum ManagerTimestamp { + MANAGER_TIMESTAMP_FIRMWARE, + MANAGER_TIMESTAMP_LOADER, + MANAGER_TIMESTAMP_KERNEL, + MANAGER_TIMESTAMP_INITRD, + MANAGER_TIMESTAMP_USERSPACE, + MANAGER_TIMESTAMP_FINISH, + + MANAGER_TIMESTAMP_SECURITY_START, + MANAGER_TIMESTAMP_SECURITY_FINISH, + MANAGER_TIMESTAMP_GENERATORS_START, + MANAGER_TIMESTAMP_GENERATORS_FINISH, + MANAGER_TIMESTAMP_UNITS_LOAD_START, + MANAGER_TIMESTAMP_UNITS_LOAD_FINISH, + MANAGER_TIMESTAMP_UNITS_LOAD, + + MANAGER_TIMESTAMP_INITRD_SECURITY_START, + MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH, + MANAGER_TIMESTAMP_INITRD_GENERATORS_START, + MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH, + MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START, + MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH, + _MANAGER_TIMESTAMP_MAX, + _MANAGER_TIMESTAMP_INVALID = -EINVAL, +} ManagerTimestamp; + +typedef enum WatchdogType { + WATCHDOG_RUNTIME, + WATCHDOG_REBOOT, + WATCHDOG_KEXEC, + WATCHDOG_PRETIMEOUT, + _WATCHDOG_TYPE_MAX, +} WatchdogType; + +#include "execute.h" +#include "job.h" +#include "path-lookup.h" +#include "show-status.h" +#include "unit-name.h" + +typedef enum ManagerTestRunFlags { + MANAGER_TEST_NORMAL = 0, /* run normally */ + MANAGER_TEST_RUN_MINIMAL = 1 << 0, /* create basic data structures */ + MANAGER_TEST_RUN_BASIC = 1 << 1, /* interact with the environment */ + MANAGER_TEST_RUN_ENV_GENERATORS = 1 << 2, /* also run env generators */ + MANAGER_TEST_RUN_GENERATORS = 1 << 3, /* also run unit generators */ + MANAGER_TEST_RUN_IGNORE_DEPENDENCIES = 1 << 4, /* run while ignoring dependencies */ + MANAGER_TEST_DONT_OPEN_EXECUTOR = 1 << 5, /* avoid trying to load sd-executor */ + MANAGER_TEST_FULL = MANAGER_TEST_RUN_BASIC | MANAGER_TEST_RUN_ENV_GENERATORS | MANAGER_TEST_RUN_GENERATORS, +} ManagerTestRunFlags; + +assert_cc((MANAGER_TEST_FULL & UINT8_MAX) == MANAGER_TEST_FULL); + +/* Various defaults for unit file settings. */ +typedef struct UnitDefaults { + ExecOutput std_output, std_error; + + usec_t restart_usec, timeout_start_usec, timeout_stop_usec, timeout_abort_usec, device_timeout_usec; + bool timeout_abort_set; + + usec_t start_limit_interval; + unsigned start_limit_burst; + + bool cpu_accounting; + bool memory_accounting; + bool io_accounting; + bool blockio_accounting; + bool tasks_accounting; + bool ip_accounting; + + CGroupTasksMax tasks_max; + usec_t timer_accuracy_usec; + + OOMPolicy oom_policy; + int oom_score_adjust; + bool oom_score_adjust_set; + + CGroupPressureWatch memory_pressure_watch; + usec_t memory_pressure_threshold_usec; + + char *smack_process_label; + + struct rlimit *rlimit[_RLIMIT_MAX]; +} UnitDefaults; + +struct Manager { + /* Note that the set of units we know of is allowed to be + * inconsistent. However the subset of it that is loaded may + * not, and the list of jobs may neither. */ + + /* Active jobs and units */ + Hashmap *units; /* name string => Unit object n:1 */ + Hashmap *units_by_invocation_id; + Hashmap *jobs; /* job id => Job object 1:1 */ + + /* To make it easy to iterate through the units of a specific + * type we maintain a per type linked list */ + LIST_HEAD(Unit, units_by_type[_UNIT_TYPE_MAX]); + + /* Units that need to be loaded */ + LIST_HEAD(Unit, load_queue); /* this is actually more a stack than a queue, but uh. */ + + /* Jobs that need to be run */ + struct Prioq *run_queue; + + /* Units and jobs that have not yet been announced via + * D-Bus. When something about a job changes it is added here + * if it is not in there yet. This allows easy coalescing of + * D-Bus change signals. */ + LIST_HEAD(Unit, dbus_unit_queue); + LIST_HEAD(Job, dbus_job_queue); + + /* Units to remove */ + LIST_HEAD(Unit, cleanup_queue); + + /* Units and jobs to check when doing GC */ + LIST_HEAD(Unit, gc_unit_queue); + LIST_HEAD(Job, gc_job_queue); + + /* Units that should be realized */ + LIST_HEAD(Unit, cgroup_realize_queue); + + /* Units whose cgroup ran empty */ + LIST_HEAD(Unit, cgroup_empty_queue); + + /* Units whose memory.event fired */ + LIST_HEAD(Unit, cgroup_oom_queue); + + /* Target units whose default target dependencies haven't been set yet */ + LIST_HEAD(Unit, target_deps_queue); + + /* Units that might be subject to StopWhenUnneeded= clean-up */ + LIST_HEAD(Unit, stop_when_unneeded_queue); + + /* Units which are upheld by another other which we might need to act on */ + LIST_HEAD(Unit, start_when_upheld_queue); + + /* Units that have BindsTo= another unit, and might need to be shutdown because the bound unit is not active. */ + LIST_HEAD(Unit, stop_when_bound_queue); + + /* Units that have resources open, and where it might be good to check if they can be released now */ + LIST_HEAD(Unit, release_resources_queue); + + sd_event *event; + + /* This maps PIDs we care about to units that are interested in them. We allow multiple units to be + * interested in the same PID and multiple PIDs to be relevant to the same unit. Since in most cases + * only a single unit will be interested in the same PID though, we use a somewhat special structure + * here: the first unit interested in a PID is stored in the hashmap 'watch_pids', keyed by the + * PID. If there are other units interested too they'll be stored in a NULL-terminated array, stored + * in the hashmap 'watch_pids_more', keyed by the PID. Thus to go through the full list of units + * interested in a PID we must look into both hashmaps. */ + Hashmap *watch_pids; /* PidRef* → Unit* */ + Hashmap *watch_pids_more; /* PidRef* → NUL terminated array of Unit* */ + + /* A set contains all units which cgroup should be refreshed after startup */ + Set *startup_units; + + /* A set which contains all currently failed units */ + Set *failed_units; + + sd_event_source *run_queue_event_source; + + char *notify_socket; + int notify_fd; + sd_event_source *notify_event_source; + + int cgroups_agent_fd; + sd_event_source *cgroups_agent_event_source; + + int signal_fd; + sd_event_source *signal_event_source; + + sd_event_source *sigchld_event_source; + + sd_event_source *time_change_event_source; + + sd_event_source *timezone_change_event_source; + + sd_event_source *jobs_in_progress_event_source; + + int user_lookup_fds[2]; + sd_event_source *user_lookup_event_source; + + RuntimeScope runtime_scope; + + LookupPaths lookup_paths; + Hashmap *unit_id_map; + Hashmap *unit_name_map; + Set *unit_path_cache; + uint64_t unit_cache_timestamp_hash; + + /* We don't have support for atomically enabling/disabling units, and unit_file_state might become + * outdated if such operations failed half-way. Therefore, we set this flag if changes to unit files + * are made, and reset it after daemon-reload. If set, we report that daemon-reload is needed through + * unit's NeedDaemonReload property. */ + bool unit_file_state_outdated; + + char **transient_environment; /* The environment, as determined from config files, kernel cmdline and environment generators */ + char **client_environment; /* Environment variables created by clients through the bus API */ + + usec_t watchdog[_WATCHDOG_TYPE_MAX]; + usec_t watchdog_overridden[_WATCHDOG_TYPE_MAX]; + char *watchdog_pretimeout_governor; + char *watchdog_pretimeout_governor_overridden; + + dual_timestamp timestamps[_MANAGER_TIMESTAMP_MAX]; + + /* Data specific to the device subsystem */ + sd_device_monitor *device_monitor; + Hashmap *devices_by_sysfs; + + /* Data specific to the mount subsystem */ + struct libmnt_monitor *mount_monitor; + sd_event_source *mount_event_source; + + /* Data specific to the swap filesystem */ + FILE *proc_swaps; + sd_event_source *swap_event_source; + Hashmap *swaps_by_devnode; + + /* Data specific to the D-Bus subsystem */ + sd_bus *api_bus, *system_bus; + Set *private_buses; + int private_listen_fd; + sd_event_source *private_listen_event_source; + + /* Contains all the clients that are subscribed to signals via + the API bus. Note that private bus connections are always + considered subscribes, since they last for very short only, + and it is much simpler that way. */ + sd_bus_track *subscribed; + char **deserialized_subscribed; + + /* This is used during reloading: before the reload we queue + * the reply message here, and afterwards we send it */ + sd_bus_message *pending_reload_message; + + Hashmap *watch_bus; /* D-Bus names => Unit object n:1 */ + + bool send_reloading_done; + + uint32_t current_job_id; + uint32_t default_unit_job_id; + + /* Data specific to the Automount subsystem */ + int dev_autofs_fd; + + /* Data specific to the cgroup subsystem */ + Hashmap *cgroup_unit; + CGroupMask cgroup_supported; + char *cgroup_root; + + /* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */ + int cgroup_inotify_fd; + sd_event_source *cgroup_inotify_event_source; + + /* Maps for finding the unit for each inotify watch descriptor for the cgroup.events and + * memory.events cgroupv2 attributes. */ + Hashmap *cgroup_control_inotify_wd_unit; + Hashmap *cgroup_memory_inotify_wd_unit; + + /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */ + sd_event_source *cgroup_empty_event_source; + sd_event_source *cgroup_oom_event_source; + + /* Make sure the user cannot accidentally unmount our cgroup + * file system */ + int pin_cgroupfs_fd; + + unsigned gc_marker; + + /* The stat() data the last time we saw /etc/localtime */ + usec_t etc_localtime_mtime; + bool etc_localtime_accessible; + + ManagerObjective objective; + + /* Flags */ + bool dispatching_load_queue; + + /* Have we already sent out the READY=1 notification? */ + bool ready_sent; + + /* Was the last status sent "STATUS=Ready."? */ + bool status_ready; + + /* Have we already printed the taint line if necessary? */ + bool taint_logged; + + /* Have we ever changed the "kernel.pid_max" sysctl? */ + bool sysctl_pid_max_changed; + + ManagerTestRunFlags test_run_flags; + + /* If non-zero, exit with the following value when the systemd + * process terminate. Useful for containers: systemd-nspawn could get + * the return value. */ + uint8_t return_value; + + ShowStatus show_status; + ShowStatus show_status_overridden; + StatusUnitFormat status_unit_format; + char *confirm_spawn; + bool no_console_output; + bool service_watchdogs; + + UnitDefaults defaults; + + int original_log_level; + LogTarget original_log_target; + bool log_level_overridden; + bool log_target_overridden; + + /* non-zero if we are reloading or reexecuting, */ + int n_reloading; + + unsigned n_installed_jobs; + unsigned n_failed_jobs; + + /* Jobs in progress watching */ + unsigned n_running_jobs; + unsigned n_on_console; + unsigned jobs_in_progress_iteration; + + /* Do we have any outstanding password prompts? */ + int have_ask_password; + int ask_password_inotify_fd; + sd_event_source *ask_password_event_source; + + /* Type=idle pipes */ + int idle_pipe[4]; + sd_event_source *idle_pipe_event_source; + + char *switch_root; + char *switch_root_init; + + /* This is true before and after switching root. */ + bool switching_root; + + /* This maps all possible path prefixes to the units needing + * them. It's a hashmap with a path string as key and a Set as + * value where Unit objects are contained. */ + Hashmap *units_requiring_mounts_for; + + /* Used for processing polkit authorization responses */ + Hashmap *polkit_registry; + + /* Dynamic users/groups, indexed by their name */ + Hashmap *dynamic_users; + + /* Keep track of all UIDs and GIDs any of our services currently use. This is useful for the RemoveIPC= logic. */ + Hashmap *uid_refs; + Hashmap *gid_refs; + + /* ExecSharedRuntime, indexed by their owner unit id */ + Hashmap *exec_shared_runtime_by_id; + + /* When the user hits C-A-D more than 7 times per 2s, do something immediately... */ + RateLimit ctrl_alt_del_ratelimit; + EmergencyAction cad_burst_action; + + const char *unit_log_field; + const char *unit_log_format_string; + + const char *invocation_log_field; + const char *invocation_log_format_string; + + int first_boot; /* tri-state */ + + /* Prefixes of e.g. RuntimeDirectory= */ + char *prefix[_EXEC_DIRECTORY_TYPE_MAX]; + char *received_credentials_directory; + char *received_encrypted_credentials_directory; + + /* Used in the SIGCHLD and sd_notify() message invocation logic to avoid that we dispatch the same event + * multiple times on the same unit. */ + unsigned sigchldgen; + unsigned notifygen; + + VarlinkServer *varlink_server; + /* When we're a system manager, this object manages the subscription from systemd-oomd to PID1 that's + * used to report changes in ManagedOOM settings (systemd server - oomd client). When + * we're a user manager, this object manages the client connection from the user manager to + * systemd-oomd to report changes in ManagedOOM settings (systemd client - oomd server). */ + Varlink *managed_oom_varlink; + + /* Reference to RestrictFileSystems= BPF program */ + struct restrict_fs_bpf *restrict_fs; + + /* Allow users to configure a rate limit for Reload() operations */ + RateLimit reload_ratelimit; + /* Dump*() are slow, so always rate limit them to 10 per 10 minutes */ + RateLimit dump_ratelimit; + + sd_event_source *memory_pressure_event_source; + + /* For NFTSet= */ + FirewallContext *fw_ctx; + + /* Pin the systemd-executor binary, so that it never changes until re-exec, ensuring we don't have + * serialization/deserialization compatibility issues during upgrades. */ + int executor_fd; +}; + +static inline usec_t manager_default_timeout_abort_usec(Manager *m) { + assert(m); + return m->defaults.timeout_abort_set ? m->defaults.timeout_abort_usec : m->defaults.timeout_stop_usec; +} + +#define MANAGER_IS_SYSTEM(m) ((m)->runtime_scope == RUNTIME_SCOPE_SYSTEM) +#define MANAGER_IS_USER(m) ((m)->runtime_scope == RUNTIME_SCOPE_USER) + +#define MANAGER_IS_RELOADING(m) ((m)->n_reloading > 0) + +#define MANAGER_IS_FINISHED(m) (dual_timestamp_is_set((m)->timestamps + MANAGER_TIMESTAMP_FINISH)) + +/* The objective is set to OK as soon as we enter the main loop, and set otherwise as soon as we are done with it */ +#define MANAGER_IS_RUNNING(m) ((m)->objective == MANAGER_OK) + +#define MANAGER_IS_SWITCHING_ROOT(m) ((m)->switching_root) + +#define MANAGER_IS_TEST_RUN(m) ((m)->test_run_flags != 0) + +static inline usec_t manager_default_timeout(RuntimeScope scope) { + return scope == RUNTIME_SCOPE_SYSTEM ? DEFAULT_TIMEOUT_USEC : DEFAULT_USER_TIMEOUT_USEC; +} + +int manager_new(RuntimeScope scope, ManagerTestRunFlags test_run_flags, Manager **m); +Manager* manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root); + +Job *manager_get_job(Manager *m, uint32_t id); +Unit *manager_get_unit(Manager *m, const char *name); + +int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j); + +bool manager_unit_cache_should_retry_load(Unit *u); +int manager_load_unit_prepare(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret); +int manager_load_unit(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret); +int manager_load_startable_unit_or_warn(Manager *m, const char *name, const char *path, Unit **ret); +int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u); + +int manager_add_job(Manager *m, JobType type, Unit *unit, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **_ret); +int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **_ret); +int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret); +int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e); + +void manager_clear_jobs(Manager *m); + +void manager_unwatch_pidref(Manager *m, PidRef *pid); + +unsigned manager_dispatch_load_queue(Manager *m); + +int manager_setup_memory_pressure_event_source(Manager *m); + +int manager_default_environment(Manager *m); +int manager_transient_environment_add(Manager *m, char **plus); +int manager_client_environment_modify(Manager *m, char **minus, char **plus); +int manager_get_effective_environment(Manager *m, char ***ret); + +int manager_set_unit_defaults(Manager *m, const UnitDefaults *defaults); + +void manager_trigger_run_queue(Manager *m); + +int manager_loop(Manager *m); + +int manager_reload(Manager *m); +Manager* manager_reloading_start(Manager *m); +void manager_reloading_stopp(Manager **m); + +void manager_reset_failed(Manager *m); + +void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success); +void manager_send_unit_plymouth(Manager *m, Unit *u); + +bool manager_unit_inactive_or_pending(Manager *m, const char *name); + +void manager_check_finished(Manager *m); +void manager_send_reloading(Manager *m); + +void disable_printk_ratelimit(void); +void manager_recheck_dbus(Manager *m); +void manager_recheck_journal(Manager *m); + +bool manager_get_show_status_on(Manager *m); +void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason); +void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason); + +void manager_set_first_boot(Manager *m, bool b); +void manager_set_switching_root(Manager *m, bool switching_root); + +double manager_get_progress(Manager *m); + +void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) _printf_(4,5); + +Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path); + +ManagerState manager_state(Manager *m); + +int manager_update_failed_units(Manager *m, Unit *u, bool failed); + +void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now); +int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc); + +void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now); +int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc); + +char* manager_taint_string(const Manager *m); + +void manager_ref_console(Manager *m); +void manager_unref_console(Manager *m); + +void manager_override_log_level(Manager *m, int level); +void manager_restore_original_log_level(Manager *m); + +void manager_override_log_target(Manager *m, LogTarget target); +void manager_restore_original_log_target(Manager *m); + +const char *manager_state_to_string(ManagerState m) _const_; +ManagerState manager_state_from_string(const char *s) _pure_; + +const char *manager_get_confirm_spawn(Manager *m); +void manager_disable_confirm_spawn(void); + +const char *manager_timestamp_to_string(ManagerTimestamp m) _const_; +ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_; +ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s); + +usec_t manager_get_watchdog(Manager *m, WatchdogType t); +void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout); +void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout); +int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor); +int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor); + +LogTarget manager_get_executor_log_target(Manager *m); + +int manager_allocate_idle_pipe(Manager *m); + +const char* oom_policy_to_string(OOMPolicy i) _const_; +OOMPolicy oom_policy_from_string(const char *s) _pure_; + +void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope); +void unit_defaults_done(UnitDefaults *defaults); diff --git a/src/core/meson.build b/src/core/meson.build new file mode 100644 index 0000000..7701d3d --- /dev/null +++ b/src/core/meson.build @@ -0,0 +1,260 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +libcore_sources = files( + 'apparmor-setup.c', + 'audit-fd.c', + 'automount.c', + 'bpf-devices.c', + 'bpf-firewall.c', + 'bpf-foreign.c', + 'bpf-lsm.c', + 'bpf-socket-bind.c', + 'cgroup.c', + 'core-varlink.c', + 'dbus-automount.c', + 'dbus-cgroup.c', + 'dbus-device.c', + 'dbus-execute.c', + 'dbus-job.c', + 'dbus-kill.c', + 'dbus-manager.c', + 'dbus-mount.c', + 'dbus-path.c', + 'dbus-scope.c', + 'dbus-service.c', + 'dbus-slice.c', + 'dbus-socket.c', + 'dbus-swap.c', + 'dbus-target.c', + 'dbus-timer.c', + 'dbus-unit.c', + 'dbus-util.c', + 'dbus.c', + 'device.c', + 'dynamic-user.c', + 'efi-random.c', + 'emergency-action.c', + 'exec-credential.c', + 'execute.c', + 'execute-serialize.c', + 'generator-setup.c', + 'ima-setup.c', + 'import-creds.c', + 'job.c', + 'kill.c', + 'kmod-setup.c', + 'load-dropin.c', + 'load-fragment.c', + 'manager-dump.c', + 'manager-serialize.c', + 'manager.c', + 'mount.c', + 'namespace.c', + 'path.c', + 'restrict-ifaces.c', + 'scope.c', + 'selinux-access.c', + 'selinux-setup.c', + 'service.c', + 'show-status.c', + 'slice.c', + 'smack-setup.c', + 'socket.c', + 'swap.c', + 'target.c', + 'timer.c', + 'transaction.c', + 'unit-dependency-atom.c', + 'unit-printf.c', + 'unit-serialize.c', + 'unit.c', +) + +if conf.get('BPF_FRAMEWORK') == 1 + libcore_sources += files( + 'bpf-util.c', + ) +endif + +subdir('bpf/socket_bind') +subdir('bpf/restrict_fs') +subdir('bpf/restrict_ifaces') + +if conf.get('BPF_FRAMEWORK') == 1 + libcore_sources += [ + socket_bind_skel_h, + restrict_fs_skel_h, + restrict_ifaces_skel_h] +endif + +load_fragment_gperf_gperf = custom_target( + 'load-fragment-gperf.gperf', + input : 'load-fragment-gperf.gperf.in', + output: 'load-fragment-gperf.gperf', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@']) + +load_fragment_gperf_c = custom_target( + 'load-fragment-gperf.c', + input : load_fragment_gperf_gperf, + output : 'load-fragment-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +awkscript = 'load-fragment-gperf-nulstr.awk' +load_fragment_gperf_nulstr_c = custom_target( + 'load-fragment-gperf-nulstr.c', + input : [awkscript, load_fragment_gperf_gperf], + output : 'load-fragment-gperf-nulstr.c', + command : [awk, '-f', '@INPUT0@', '@INPUT1@'], + capture : true) + +libcore_name = 'systemd-core-@0@'.format(shared_lib_tag) + +libcore = shared_library( + libcore_name, + libcore_sources, + load_fragment_gperf_c, + load_fragment_gperf_nulstr_c, + include_directories : includes, + c_args : ['-fvisibility=default'], + link_args : ['-shared', + '-Wl,--version-script=' + libshared_sym_path], + link_depends : libshared_sym_path, + link_with : libshared, + dependencies : [libacl, + libapparmor, + libaudit, + libblkid, + libdl, + libkmod, + libm, + libmount, + libpam, + librt, + libseccomp, + libselinux, + threads, + userspace], + install : true, + install_dir : pkglibdir) + +core_includes = [includes, include_directories('.')] + +systemd_sources = files( + 'main.c', + 'crash-handler.c', +) + +systemd_executor_sources = files( + 'executor.c', + 'exec-invoke.c', +) + +executables += [ + libexec_template + { + 'name' : 'systemd', + 'dbus' : true, + 'public' : true, + 'sources' : systemd_sources, + 'link_with' : [ + libcore, + libshared, + ], + 'dependencies' : libseccomp, + }, + libexec_template + { + 'name' : 'systemd-executor', + 'public' : true, + 'sources' : systemd_executor_sources, + 'include_directories' : core_includes, + 'link_with' : [ + libcore, + libshared, + ], + 'dependencies' : [ + libapparmor, + libpam, + libseccomp, + libselinux, + ], + }, + fuzz_template + { + 'sources' : files('fuzz-unit-file.c'), + 'link_with' : [ + libcore, + libshared + ], + 'dependencies' : libmount, + }, + fuzz_template + { + 'sources' : files('fuzz-manager-serialize.c'), + 'link_with' : [ + libcore, + libshared + ], + }, + fuzz_template + { + 'sources' : files('fuzz-execute-serialize.c'), + 'link_with' : [ + libcore, + libshared + ], + }, +] + +in_files = [['system.conf', pkgconfigfiledir], + ['user.conf', pkgconfigfiledir], + ['org.freedesktop.systemd1.policy', polkitpolicydir]] + +foreach item : in_files + file = item[0] + dir = item[1] + + custom_target( + file, + input : file + '.in', + output: file, + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : (dir == pkgconfigfiledir) ? install_sysconfdir_samples : (dir != 'no'), + install_dir : dir) +endforeach + +systemd_pc = custom_target( + 'systemd.pc', + input : 'systemd.pc.in', + output : 'systemd.pc', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : pkgconfigdatadir != 'no', + install_tag : 'devel', + install_dir : pkgconfigdatadir) + +install_data('org.freedesktop.systemd1.conf', + install_dir : dbuspolicydir) +install_data('org.freedesktop.systemd1.service', + install_dir : dbussystemservicedir) + +install_emptydir(systemshutdowndir) +install_emptydir(systemsleepdir) +install_emptydir(systemgeneratordir) +install_emptydir(usergeneratordir) + +if install_sysconfdir + install_emptydir(pkgsysconfdir / 'system') + install_emptydir(pkgsysconfdir / 'user') + install_emptydir(sysconfdir / 'xdg/systemd') + meson.add_install_script(sh, '-c', ln_s.format(pkgsysconfdir / 'user', + sysconfdir / 'xdg/systemd/user')) +endif + +install_emptydir(sbindir) +meson.add_install_script(sh, '-c', ln_s.format(libexecdir / 'systemd', sbindir / 'init')) + +############################################################ + +core_test_template = test_template + { + 'link_with' : [ + libcore, + libshared, + ], + 'include_directories' : core_includes, + 'suite' : 'core', +} diff --git a/src/core/mount.c b/src/core/mount.c new file mode 100644 index 0000000..ded322d --- /dev/null +++ b/src/core/mount.c @@ -0,0 +1,2502 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "dbus-mount.h" +#include "dbus-unit.h" +#include "device.h" +#include "exit-status.h" +#include "format-util.h" +#include "fs-util.h" +#include "fstab-util.h" +#include "initrd-util.h" +#include "libmount-util.h" +#include "log.h" +#include "manager.h" +#include "mkdir-label.h" +#include "mount-setup.h" +#include "mount.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" +#include "utf8.h" + +#define RETRY_UMOUNT_MAX 32 + +static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = { + [MOUNT_DEAD] = UNIT_INACTIVE, + [MOUNT_MOUNTING] = UNIT_ACTIVATING, + [MOUNT_MOUNTING_DONE] = UNIT_ACTIVATING, + [MOUNT_MOUNTED] = UNIT_ACTIVE, + [MOUNT_REMOUNTING] = UNIT_RELOADING, + [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING, + [MOUNT_REMOUNTING_SIGTERM] = UNIT_RELOADING, + [MOUNT_REMOUNTING_SIGKILL] = UNIT_RELOADING, + [MOUNT_UNMOUNTING_SIGTERM] = UNIT_DEACTIVATING, + [MOUNT_UNMOUNTING_SIGKILL] = UNIT_DEACTIVATING, + [MOUNT_FAILED] = UNIT_FAILED, + [MOUNT_CLEANING] = UNIT_MAINTENANCE, +}; + +static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static void mount_enter_dead(Mount *m, MountResult f); +static void mount_enter_mounted(Mount *m, MountResult f); +static void mount_cycle_clear(Mount *m); +static int mount_process_proc_self_mountinfo(Manager *m); + +static bool MOUNT_STATE_WITH_PROCESS(MountState state) { + return IN_SET(state, + MOUNT_MOUNTING, + MOUNT_MOUNTING_DONE, + MOUNT_REMOUNTING, + MOUNT_REMOUNTING_SIGTERM, + MOUNT_REMOUNTING_SIGKILL, + MOUNT_UNMOUNTING, + MOUNT_UNMOUNTING_SIGTERM, + MOUNT_UNMOUNTING_SIGKILL, + MOUNT_CLEANING); +} + +static MountParameters* get_mount_parameters_fragment(Mount *m) { + assert(m); + + if (m->from_fragment) + return &m->parameters_fragment; + + return NULL; +} + +static MountParameters* get_mount_parameters(Mount *m) { + assert(m); + + if (m->from_proc_self_mountinfo) + return &m->parameters_proc_self_mountinfo; + + return get_mount_parameters_fragment(m); +} + +static bool mount_is_network(const MountParameters *p) { + assert(p); + + if (fstab_test_option(p->options, "_netdev\0")) + return true; + + if (p->fstype && fstype_is_network(p->fstype)) + return true; + + return false; +} + +static bool mount_is_nofail(const Mount *m) { + assert(m); + + if (!m->from_fragment) + return false; + + return fstab_test_yes_no_option(m->parameters_fragment.options, "nofail\0" "fail\0"); +} + +static bool mount_is_loop(const MountParameters *p) { + assert(p); + + if (fstab_test_option(p->options, "loop\0")) + return true; + + return false; +} + +static bool mount_is_bind(const MountParameters *p) { + assert(p); + return fstab_is_bind(p->options, p->fstype); +} + +static int mount_is_bound_to_device(Mount *m) { + _cleanup_free_ char *value = NULL; + const MountParameters *p; + int r; + + assert(m); + + /* Determines whether to place a Requires= or BindsTo= dependency on the backing device unit. We do + * this by checking for the x-systemd.device-bound= mount option. If it is enabled we use BindsTo=, + * otherwise Requires=. But note that we might combine the latter with StopPropagatedFrom=, see + * below. */ + + p = get_mount_parameters(m); + if (!p) + return false; + + r = fstab_filter_options(p->options, "x-systemd.device-bound\0", NULL, &value, NULL, NULL); + if (r < 0) + return r; + if (r == 0) + return -EIDRM; /* If unspecified at all, return recognizable error */ + + if (isempty(value)) + return true; + + return parse_boolean(value); +} + +static bool mount_propagate_stop(Mount *m) { + int r; + + assert(m); + + r = mount_is_bound_to_device(m); + if (r >= 0) + /* If x-systemd.device-bound=no is explicitly requested by user, don't try to set StopPropagatedFrom=. + * Also don't bother if true, since with BindsTo= the stop propagation is implicit. */ + return false; + if (r != -EIDRM) + log_debug_errno(r, "Failed to get x-systemd.device-bound= option, ignoring: %m"); + + return m->from_fragment; /* let's propagate stop whenever this is an explicitly configured unit, + * otherwise let's not bother. */ +} + +static bool mount_needs_quota(const MountParameters *p) { + assert(p); + + if (p->fstype && !fstype_needs_quota(p->fstype)) + return false; + + if (mount_is_bind(p)) + return false; + + return fstab_test_option(p->options, + "usrquota\0" "grpquota\0" "quota\0" "usrjquota\0" "grpjquota\0"); +} + +static void mount_init(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + assert(u); + assert(u->load_state == UNIT_STUB); + + m->timeout_usec = u->manager->defaults.timeout_start_usec; + + m->exec_context.std_output = u->manager->defaults.std_output; + m->exec_context.std_error = u->manager->defaults.std_error; + + m->directory_mode = 0755; + + /* We need to make sure that /usr/bin/mount is always called + * in the same process group as us, so that the autofs kernel + * side doesn't send us another mount request while we are + * already trying to comply its last one. */ + m->exec_context.same_pgrp = true; + + m->control_pid = PIDREF_NULL; + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + + u->ignore_on_isolate = true; +} + +static int mount_arm_timer(Mount *m, bool relative, usec_t usec) { + assert(m); + + return unit_arm_timer(UNIT(m), &m->timer_event_source, relative, usec, mount_dispatch_timer); +} + +static void mount_unwatch_control_pid(Mount *m) { + assert(m); + + if (!pidref_is_set(&m->control_pid)) + return; + + unit_unwatch_pidref(UNIT(m), &m->control_pid); + pidref_done(&m->control_pid); +} + +static void mount_parameters_done(MountParameters *p) { + assert(p); + + p->what = mfree(p->what); + p->options = mfree(p->options); + p->fstype = mfree(p->fstype); +} + +static void mount_done(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + m->where = mfree(m->where); + + mount_parameters_done(&m->parameters_proc_self_mountinfo); + mount_parameters_done(&m->parameters_fragment); + + m->exec_runtime = exec_runtime_free(m->exec_runtime); + exec_command_done_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX); + m->control_command = NULL; + + mount_unwatch_control_pid(m); + + m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source); +} + +static int update_parameters_proc_self_mountinfo( + Mount *m, + const char *what, + const char *options, + const char *fstype) { + + MountParameters *p; + int r, q, w; + + p = &m->parameters_proc_self_mountinfo; + + r = free_and_strdup(&p->what, what); + if (r < 0) + return r; + + q = free_and_strdup(&p->options, options); + if (q < 0) + return q; + + w = free_and_strdup(&p->fstype, fstype); + if (w < 0) + return w; + + return r > 0 || q > 0 || w > 0; +} + +static int mount_add_mount_dependencies(Mount *m) { + MountParameters *pm; + Unit *other; + Set *s; + int r; + + assert(m); + + if (!path_equal(m->where, "/")) { + _cleanup_free_ char *parent = NULL; + + /* Adds in links to other mount points that might lie further up in the hierarchy */ + + r = path_extract_directory(m->where, &parent); + if (r < 0) + return r; + + r = unit_require_mounts_for(UNIT(m), parent, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + } + + /* Adds in dependencies to other mount points that might be needed for the source path (if this is a bind mount + * or a loop mount) to be available. */ + pm = get_mount_parameters_fragment(m); + if (pm && pm->what && + path_is_absolute(pm->what) && + (mount_is_bind(pm) || mount_is_loop(pm) || !mount_is_network(pm))) { + + r = unit_require_mounts_for(UNIT(m), pm->what, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + /* Adds in dependencies to other units that use this path or paths further down in the hierarchy */ + s = manager_get_units_requiring_mounts_for(UNIT(m)->manager, m->where); + SET_FOREACH(other, s) { + + if (other->load_state != UNIT_LOADED) + continue; + + if (other == UNIT(m)) + continue; + + r = unit_add_dependency(other, UNIT_AFTER, UNIT(m), true, UNIT_DEPENDENCY_PATH); + if (r < 0) + return r; + + if (UNIT(m)->fragment_path) { + /* If we have fragment configuration, then make this dependency required */ + r = unit_add_dependency(other, UNIT_REQUIRES, UNIT(m), true, UNIT_DEPENDENCY_PATH); + if (r < 0) + return r; + } + } + + return 0; +} + +static int mount_add_device_dependencies(Mount *m) { + UnitDependencyMask mask; + MountParameters *p; + UnitDependency dep; + int r; + + assert(m); + + log_unit_trace(UNIT(m), "Processing implicit device dependencies"); + + p = get_mount_parameters(m); + if (!p) { + log_unit_trace(UNIT(m), "Missing mount parameters, skipping implicit device dependencies"); + return 0; + } + + if (!p->what) { + log_unit_trace(UNIT(m), "Missing mount source, skipping implicit device dependencies"); + return 0; + } + + if (mount_is_bind(p)) { + log_unit_trace(UNIT(m), "Mount unit is a bind mount, skipping implicit device dependencies"); + return 0; + } + + if (!is_device_path(p->what)) { + log_unit_trace(UNIT(m), "Mount source is not a device path, skipping implicit device dependencies"); + return 0; + } + + /* /dev/root is a really weird thing, it's not a real device, but just a path the kernel exports for + * the root file system specified on the kernel command line. Ignore it here. */ + if (PATH_IN_SET(p->what, "/dev/root", "/dev/nfs")) { + log_unit_trace(UNIT(m), "Mount source is in /dev/root or /dev/nfs, skipping implicit device dependencies"); + return 0; + } + + if (path_equal(m->where, "/")) { + log_unit_trace(UNIT(m), "Mount destination is '/', skipping implicit device dependencies"); + return 0; + } + + /* Mount units from /proc/self/mountinfo are not bound to devices by default since they're subject to + * races when mounts are established by other tools with different backing devices than what we + * maintain. The user can still force this to be a BindsTo= dependency with an appropriate option (or + * udev property) so the mount units are automatically stopped when the device disappears + * suddenly. */ + dep = mount_is_bound_to_device(m) > 0 ? UNIT_BINDS_TO : UNIT_REQUIRES; + + /* We always use 'what' from /proc/self/mountinfo if mounted */ + mask = m->from_proc_self_mountinfo ? UNIT_DEPENDENCY_MOUNTINFO : UNIT_DEPENDENCY_MOUNT_FILE; + + r = unit_add_node_dependency(UNIT(m), p->what, dep, mask); + if (r < 0) + return r; + if (r > 0) + log_unit_trace(UNIT(m), "Added %s dependency on %s", unit_dependency_to_string(dep), p->what); + + if (mount_propagate_stop(m)) { + r = unit_add_node_dependency(UNIT(m), p->what, UNIT_STOP_PROPAGATED_FROM, mask); + if (r < 0) + return r; + if (r > 0) + log_unit_trace(UNIT(m), "Added %s dependency on %s", + unit_dependency_to_string(UNIT_STOP_PROPAGATED_FROM), p->what); + } + + r = unit_add_blockdev_dependency(UNIT(m), p->what, mask); + if (r > 0) + log_unit_trace(UNIT(m), "Added %s dependency on %s", unit_dependency_to_string(UNIT_AFTER), p->what); + + return 0; +} + +static int mount_add_quota_dependencies(Mount *m) { + MountParameters *p; + int r; + + assert(m); + + if (!MANAGER_IS_SYSTEM(UNIT(m)->manager)) + return 0; + + p = get_mount_parameters_fragment(m); + if (!p) + return 0; + + if (!mount_needs_quota(p)) + return 0; + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTACHECK_SERVICE, + /* add_reference= */ true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTAON_SERVICE, + /* add_reference= */true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + return 0; +} + +static bool mount_is_extrinsic(Unit *u) { + MountParameters *p; + Mount *m = MOUNT(u); + assert(m); + + /* Returns true for all units that are "magic" and should be excluded from the usual + * start-up and shutdown dependencies. We call them "extrinsic" here, as they are generally + * mounted outside of the systemd dependency logic. We shouldn't attempt to manage them + * ourselves but it's fine if the user operates on them with us. */ + + /* We only automatically manage mounts if we are in system mode */ + if (MANAGER_IS_USER(u->manager)) + return true; + + p = get_mount_parameters(m); + if (p && fstab_is_extrinsic(m->where, p->options)) + return true; + + return false; +} + +static bool mount_is_credentials(Mount *m) { + const char *e; + + assert(m); + + /* Returns true if this is a credentials mount. We don't want automatic dependencies on credential + * mounts, since they are managed by us for even the earliest services, and we never want anything to + * be ordered before them hence. */ + + e = path_startswith(m->where, UNIT(m)->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + if (!e) + return false; + + return !isempty(path_startswith(e, "credentials")); +} + +static int mount_add_default_ordering_dependencies(Mount *m, MountParameters *p, UnitDependencyMask mask) { + const char *after, *before, *e; + int r; + + assert(m); + + e = path_startswith(m->where, "/sysroot"); + if (e && in_initrd()) { + /* All mounts under /sysroot need to happen later, at initrd-fs.target time. IOW, + * it's not technically part of the basic initrd filesystem itself, and so + * shouldn't inherit the default Before=local-fs.target dependency. However, + * these mounts still need to start after local-fs-pre.target, as a sync point + * for things like systemd-hibernate-resume.service that should start before + * any mounts. */ + + after = SPECIAL_LOCAL_FS_PRE_TARGET; + before = isempty(e) ? SPECIAL_INITRD_ROOT_FS_TARGET : SPECIAL_INITRD_FS_TARGET; + + } else if (in_initrd() && path_startswith(m->where, "/sysusr/usr")) { + after = SPECIAL_LOCAL_FS_PRE_TARGET; + before = SPECIAL_INITRD_USR_FS_TARGET; + + } else if (mount_is_credentials(m)) + after = before = NULL; + + else if (mount_is_network(p)) { + after = SPECIAL_REMOTE_FS_PRE_TARGET; + before = SPECIAL_REMOTE_FS_TARGET; + + } else { + after = SPECIAL_LOCAL_FS_PRE_TARGET; + before = SPECIAL_LOCAL_FS_TARGET; + } + + if (before && !mount_is_nofail(m)) { + r = unit_add_dependency_by_name(UNIT(m), UNIT_BEFORE, before, /* add_reference= */ true, mask); + if (r < 0) + return r; + } + + if (after) { + r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, after, /* add_reference= */ true, mask); + if (r < 0) + return r; + } + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, + /* add_reference= */ true, mask); + if (r < 0) + return r; + + /* If this is a tmpfs mount then we have to unmount it before we try to deactivate swaps */ + if (streq_ptr(p->fstype, "tmpfs") && !mount_is_credentials(m)) { + r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_SWAP_TARGET, + /* add_reference= */ true, mask); + if (r < 0) + return r; + } + + return 0; +} + +static int mount_add_default_network_dependencies(Mount *m, MountParameters *p, UnitDependencyMask mask) { + int r; + + assert(m); + + if (!mount_is_network(p)) + return 0; + + /* We order ourselves after network.target. This is primarily useful at shutdown: services that take + * down the network should order themselves before network.target, so that they are shut down only + * after this mount unit is stopped. */ + + r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_NETWORK_TARGET, + /* add_reference= */ true, mask); + if (r < 0) + return r; + + /* We pull in network-online.target, and order ourselves after it. This is useful at start-up to + * actively pull in tools that want to be started before we start mounting network file systems, and + * whose purpose it is to delay this until the network is "up". */ + + return unit_add_two_dependencies_by_name(UNIT(m), UNIT_WANTS, UNIT_AFTER, SPECIAL_NETWORK_ONLINE_TARGET, + /* add_reference= */ true, mask); +} + +static int mount_add_default_dependencies(Mount *m) { + UnitDependencyMask mask; + MountParameters *p; + int r; + + assert(m); + + if (!UNIT(m)->default_dependencies) + return 0; + + /* We do not add any default dependencies to /, /usr or /run/initramfs/, since they are + * guaranteed to stay mounted the whole time, since our system is on it. Also, don't + * bother with anything mounted below virtual file systems, it's also going to be virtual, + * and hence not worth the effort. */ + if (mount_is_extrinsic(UNIT(m))) + return 0; + + p = get_mount_parameters(m); + if (!p) + return 0; + + mask = m->from_proc_self_mountinfo ? UNIT_DEPENDENCY_MOUNTINFO : UNIT_DEPENDENCY_MOUNT_FILE; + + r = mount_add_default_ordering_dependencies(m, p, mask); + if (r < 0) + return r; + + r = mount_add_default_network_dependencies(m, p, mask); + if (r < 0) + return r; + + return 0; +} + +static int mount_verify(Mount *m) { + _cleanup_free_ char *e = NULL; + MountParameters *p; + int r; + + assert(m); + assert(UNIT(m)->load_state == UNIT_LOADED); + + if (!m->from_fragment && !m->from_proc_self_mountinfo && !UNIT(m)->perpetual) + return -ENOENT; + + r = unit_name_from_path(m->where, ".mount", &e); + if (r < 0) + return log_unit_error_errno(UNIT(m), r, "Failed to generate unit name from mount path: %m"); + + if (!unit_has_name(UNIT(m), e)) + return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Where= setting doesn't match unit name. Refusing."); + + if (mount_point_is_api(m->where) || mount_point_ignore(m->where)) + return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Cannot create mount unit for API file system %s. Refusing.", m->where); + + p = get_mount_parameters_fragment(m); + if (p && !p->what && !UNIT(m)->perpetual) + return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), + "What= setting is missing. Refusing."); + + if (m->exec_context.pam_name && m->kill_context.kill_mode != KILL_CONTROL_GROUP) + return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to control-group'. Refusing."); + + return 0; +} + +static int mount_add_non_exec_dependencies(Mount *m) { + int r; + + assert(m); + + /* We may be called due to this mount appearing in /proc/self/mountinfo, hence we clear all existing + * dependencies that were initialized from the unit file but whose final value really depends on the + * content of /proc/self/mountinfo. Some (such as m->where) might have become stale now. */ + unit_remove_dependencies(UNIT(m), UNIT_DEPENDENCY_MOUNTINFO | UNIT_DEPENDENCY_MOUNT_FILE); + + if (!m->where) + return 0; + + /* Adds in all dependencies directly responsible for ordering the mount, as opposed to dependencies + * resulting from the ExecContext and such. */ + + r = mount_add_device_dependencies(m); + if (r < 0) + return r; + + r = mount_add_mount_dependencies(m); + if (r < 0) + return r; + + r = mount_add_quota_dependencies(m); + if (r < 0) + return r; + + r = mount_add_default_dependencies(m); + if (r < 0) + return r; + + return 0; +} + +static int mount_add_extras(Mount *m) { + Unit *u = UNIT(m); + int r; + + assert(m); + + /* Note: this call might be called after we already have been loaded once (and even when it has already been + * activated), in case data from /proc/self/mountinfo has changed. This means all code here needs to be ready + * to run with an already set up unit. */ + + if (u->fragment_path) + m->from_fragment = true; + + if (!m->where) { + r = unit_name_to_path(u->id, &m->where); + if (r == -ENAMETOOLONG) + log_unit_error_errno(u, r, "Failed to derive mount point path from unit name, because unit name is hashed. " + "Set \"Where=\" in the unit file explicitly."); + if (r < 0) + return r; + } + + path_simplify(m->where); + + if (!u->description) { + r = unit_set_description(u, m->where); + if (r < 0) + return r; + } + + r = unit_patch_contexts(u); + if (r < 0) + return r; + + r = unit_add_exec_dependencies(u, &m->exec_context); + if (r < 0) + return r; + + r = unit_set_default_slice(u); + if (r < 0) + return r; + + r = mount_add_non_exec_dependencies(m); + if (r < 0) + return r; + + return 0; +} + +static void mount_load_root_mount(Unit *u) { + assert(u); + + if (!unit_has_name(u, SPECIAL_ROOT_MOUNT)) + return; + + u->perpetual = true; + u->default_dependencies = false; + + /* The stdio/kmsg bridge socket is on /, in order to avoid a dep loop, don't use kmsg logging for -.mount */ + MOUNT(u)->exec_context.std_output = EXEC_OUTPUT_NULL; + MOUNT(u)->exec_context.std_input = EXEC_INPUT_NULL; + + if (!u->description) + u->description = strdup("Root Mount"); +} + +static int mount_load(Unit *u) { + Mount *m = MOUNT(u); + int r, q = 0; + + assert(m); + assert(u); + assert(u->load_state == UNIT_STUB); + + mount_load_root_mount(u); + + bool fragment_optional = m->from_proc_self_mountinfo || u->perpetual; + r = unit_load_fragment_and_dropin(u, !fragment_optional); + + /* Add in some extras. Note we do this in all cases (even if we failed to load the unit) when announced by the + * kernel, because we need some things to be set up no matter what when the kernel establishes a mount and thus + * we need to update the state in our unit to track it. After all, consider that we don't allow changing the + * 'slice' field for a unit once it is active. */ + if (u->load_state == UNIT_LOADED || m->from_proc_self_mountinfo || u->perpetual) + q = mount_add_extras(m); + + if (r < 0) + return r; + if (q < 0) + return q; + if (u->load_state != UNIT_LOADED) + return 0; + + return mount_verify(m); +} + +static void mount_set_state(Mount *m, MountState state) { + MountState old_state; + assert(m); + + if (m->state != state) + bus_unit_send_pending_change_signal(UNIT(m), false); + + old_state = m->state; + m->state = state; + + if (!MOUNT_STATE_WITH_PROCESS(state)) { + m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source); + mount_unwatch_control_pid(m); + m->control_command = NULL; + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + } + + if (state != old_state) + log_unit_debug(UNIT(m), "Changed %s -> %s", mount_state_to_string(old_state), mount_state_to_string(state)); + + unit_notify(UNIT(m), state_translation_table[old_state], state_translation_table[state], m->reload_result == MOUNT_SUCCESS); +} + +static int mount_coldplug(Unit *u) { + Mount *m = MOUNT(u); + int r; + + assert(m); + assert(m->state == MOUNT_DEAD); + + if (m->deserialized_state == m->state) + return 0; + + if (pidref_is_set(&m->control_pid) && + pidref_is_unwaited(&m->control_pid) > 0 && + MOUNT_STATE_WITH_PROCESS(m->deserialized_state)) { + + r = unit_watch_pidref(UNIT(m), &m->control_pid, /* exclusive= */ false); + if (r < 0) + return r; + + r = mount_arm_timer(m, /* relative= */ false, usec_add(u->state_change_timestamp.monotonic, m->timeout_usec)); + if (r < 0) + return r; + } + + if (!IN_SET(m->deserialized_state, MOUNT_DEAD, MOUNT_FAILED)) + (void) unit_setup_exec_runtime(u); + + mount_set_state(m, m->deserialized_state); + return 0; +} + +static void mount_catchup(Unit *u) { + Mount *m = MOUNT(ASSERT_PTR(u)); + + assert(m); + + /* Adjust the deserialized state. See comments in mount_process_proc_self_mountinfo(). */ + if (m->from_proc_self_mountinfo) + switch (m->state) { + case MOUNT_DEAD: + case MOUNT_FAILED: + assert(!pidref_is_set(&m->control_pid)); + (void) unit_acquire_invocation_id(u); + mount_cycle_clear(m); + mount_enter_mounted(m, MOUNT_SUCCESS); + break; + case MOUNT_MOUNTING: + assert(pidref_is_set(&m->control_pid)); + mount_set_state(m, MOUNT_MOUNTING_DONE); + break; + default: + break; + } + else + switch (m->state) { + case MOUNT_MOUNTING_DONE: + assert(pidref_is_set(&m->control_pid)); + mount_set_state(m, MOUNT_MOUNTING); + break; + case MOUNT_MOUNTED: + assert(!pidref_is_set(&m->control_pid)); + mount_enter_dead(m, MOUNT_SUCCESS); + break; + default: + break; + } +} + +static void mount_dump(Unit *u, FILE *f, const char *prefix) { + Mount *m = MOUNT(u); + MountParameters *p; + + assert(m); + assert(f); + + p = get_mount_parameters(m); + + fprintf(f, + "%sMount State: %s\n" + "%sResult: %s\n" + "%sClean Result: %s\n" + "%sWhere: %s\n" + "%sWhat: %s\n" + "%sFile System Type: %s\n" + "%sOptions: %s\n" + "%sFrom /proc/self/mountinfo: %s\n" + "%sFrom fragment: %s\n" + "%sExtrinsic: %s\n" + "%sDirectoryMode: %04o\n" + "%sSloppyOptions: %s\n" + "%sLazyUnmount: %s\n" + "%sForceUnmount: %s\n" + "%sReadWriteOnly: %s\n" + "%sTimeoutSec: %s\n", + prefix, mount_state_to_string(m->state), + prefix, mount_result_to_string(m->result), + prefix, mount_result_to_string(m->clean_result), + prefix, m->where, + prefix, p ? strna(p->what) : "n/a", + prefix, p ? strna(p->fstype) : "n/a", + prefix, p ? strna(p->options) : "n/a", + prefix, yes_no(m->from_proc_self_mountinfo), + prefix, yes_no(m->from_fragment), + prefix, yes_no(mount_is_extrinsic(u)), + prefix, m->directory_mode, + prefix, yes_no(m->sloppy_options), + prefix, yes_no(m->lazy_unmount), + prefix, yes_no(m->force_unmount), + prefix, yes_no(m->read_write_only), + prefix, FORMAT_TIMESPAN(m->timeout_usec, USEC_PER_SEC)); + + if (pidref_is_set(&m->control_pid)) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, m->control_pid.pid); + + exec_context_dump(&m->exec_context, f, prefix); + kill_context_dump(&m->kill_context, f, prefix); + cgroup_context_dump(UNIT(m), f, prefix); +} + +static int mount_spawn(Mount *m, ExecCommand *c, PidRef *ret_pid) { + + _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT( + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN); + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + pid_t pid; + int r; + + assert(m); + assert(c); + assert(ret_pid); + + r = unit_prepare_exec(UNIT(m)); + if (r < 0) + return r; + + r = mount_arm_timer(m, /* relative= */ true, m->timeout_usec); + if (r < 0) + return r; + + r = unit_set_exec_params(UNIT(m), &exec_params); + if (r < 0) + return r; + + r = exec_spawn(UNIT(m), + c, + &m->exec_context, + &exec_params, + m->exec_runtime, + &m->cgroup_context, + &pid); + if (r < 0) + return r; + + r = pidref_set_pid(&pidref, pid); + if (r < 0) + return r; + + r = unit_watch_pidref(UNIT(m), &pidref, /* exclusive= */ true); + if (r < 0) + return r; + + *ret_pid = TAKE_PIDREF(pidref); + return 0; +} + +static void mount_enter_dead(Mount *m, MountResult f) { + assert(m); + + if (m->result == MOUNT_SUCCESS) + m->result = f; + + unit_log_result(UNIT(m), m->result == MOUNT_SUCCESS, mount_result_to_string(m->result)); + unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_stop); + + mount_set_state(m, m->result != MOUNT_SUCCESS ? MOUNT_FAILED : MOUNT_DEAD); + + m->exec_runtime = exec_runtime_destroy(m->exec_runtime); + + unit_destroy_runtime_data(UNIT(m), &m->exec_context); + + unit_unref_uid_gid(UNIT(m), true); + + /* Any dependencies based on /proc/self/mountinfo are now stale. Let's re-generate dependencies from + * .mount unit. */ + (void) mount_add_non_exec_dependencies(m); +} + +static void mount_enter_mounted(Mount *m, MountResult f) { + assert(m); + + if (m->result == MOUNT_SUCCESS) + m->result = f; + + mount_set_state(m, MOUNT_MOUNTED); +} + +static void mount_enter_dead_or_mounted(Mount *m, MountResult f) { + assert(m); + + /* Enter DEAD or MOUNTED state, depending on what the kernel currently says about the mount point. We use this + * whenever we executed an operation, so that our internal state reflects what the kernel says again, after all + * ultimately we just mirror the kernel's internal state on this. */ + + if (m->from_proc_self_mountinfo) + mount_enter_mounted(m, f); + else + mount_enter_dead(m, f); +} + +static int state_to_kill_operation(MountState state) { + switch (state) { + + case MOUNT_REMOUNTING_SIGTERM: + return KILL_RESTART; + + case MOUNT_UNMOUNTING_SIGTERM: + return KILL_TERMINATE; + + case MOUNT_REMOUNTING_SIGKILL: + case MOUNT_UNMOUNTING_SIGKILL: + return KILL_KILL; + + default: + return _KILL_OPERATION_INVALID; + } +} + +static void mount_enter_signal(Mount *m, MountState state, MountResult f) { + int r; + + assert(m); + + if (m->result == MOUNT_SUCCESS) + m->result = f; + + r = unit_kill_context( + UNIT(m), + &m->kill_context, + state_to_kill_operation(state), + /* main_pid= */ NULL, + &m->control_pid, + /* main_pid_alien= */ false); + if (r < 0) { + log_unit_warning_errno(UNIT(m), r, "Failed to kill processes: %m"); + goto fail; + } + + if (r > 0) { + r = mount_arm_timer(m, /* relative= */ true, m->timeout_usec); + if (r < 0) { + log_unit_warning_errno(UNIT(m), r, "Failed to install timer: %m"); + goto fail; + } + + mount_set_state(m, state); + } else if (state == MOUNT_REMOUNTING_SIGTERM && m->kill_context.send_sigkill) + mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS); + else if (IN_SET(state, MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL)) + mount_enter_mounted(m, MOUNT_SUCCESS); + else if (state == MOUNT_UNMOUNTING_SIGTERM && m->kill_context.send_sigkill) + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS); + else + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + + return; + +fail: + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); +} + +static int mount_set_umount_command(Mount *m, ExecCommand *c) { + int r; + + assert(m); + assert(c); + + r = exec_command_set(c, UMOUNT_PATH, m->where, "-c", NULL); + if (r < 0) + return r; + + if (m->lazy_unmount) { + r = exec_command_append(c, "-l", NULL); + if (r < 0) + return r; + } + + if (m->force_unmount) { + r = exec_command_append(c, "-f", NULL); + if (r < 0) + return r; + } + + return 0; +} + +static void mount_enter_unmounting(Mount *m) { + int r; + + assert(m); + + /* Start counting our attempts */ + if (!IN_SET(m->state, + MOUNT_UNMOUNTING, + MOUNT_UNMOUNTING_SIGTERM, + MOUNT_UNMOUNTING_SIGKILL)) + m->n_retry_umount = 0; + + m->control_command_id = MOUNT_EXEC_UNMOUNT; + m->control_command = m->exec_command + MOUNT_EXEC_UNMOUNT; + + r = mount_set_umount_command(m, m->control_command); + if (r < 0) { + log_unit_warning_errno(UNIT(m), r, "Failed to prepare umount command line: %m"); + goto fail; + } + + mount_unwatch_control_pid(m); + + r = mount_spawn(m, m->control_command, &m->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(m), r, "Failed to spawn 'umount' task: %m"); + goto fail; + } + + mount_set_state(m, MOUNT_UNMOUNTING); + + return; + +fail: + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); +} + +static int mount_set_mount_command(Mount *m, ExecCommand *c, const MountParameters *p) { + int r; + + assert(m); + assert(c); + assert(p); + + r = exec_command_set(c, MOUNT_PATH, p->what, m->where, NULL); + if (r < 0) + return r; + + if (m->sloppy_options) { + r = exec_command_append(c, "-s", NULL); + if (r < 0) + return r; + } + + if (m->read_write_only) { + r = exec_command_append(c, "-w", NULL); + if (r < 0) + return r; + } + + if (p->fstype) { + r = exec_command_append(c, "-t", p->fstype, NULL); + if (r < 0) + return r; + } + + _cleanup_free_ char *opts = NULL; + r = fstab_filter_options(p->options, "nofail\0" "noauto\0" "auto\0", NULL, NULL, NULL, &opts); + if (r < 0) + return r; + + if (!isempty(opts)) { + r = exec_command_append(c, "-o", opts, NULL); + if (r < 0) + return r; + } + + return 0; +} + +static void mount_enter_mounting(Mount *m) { + int r; + MountParameters *p; + bool source_is_dir = true; + + assert(m); + + r = unit_fail_if_noncanonical(UNIT(m), m->where); + if (r < 0) + goto fail; + + p = get_mount_parameters_fragment(m); + if (p && mount_is_bind(p)) { + r = is_dir(p->what, /* follow = */ true); + if (r < 0 && r != -ENOENT) + log_unit_info_errno(UNIT(m), r, "Failed to determine type of bind mount source '%s', ignoring: %m", p->what); + else if (r == 0) + source_is_dir = false; + } + + if (source_is_dir) + r = mkdir_p_label(m->where, m->directory_mode); + else + r = touch_file(m->where, /* parents = */ true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID); + if (r < 0 && r != -EEXIST) + log_unit_warning_errno(UNIT(m), r, "Failed to create mount point '%s', ignoring: %m", m->where); + + if (source_is_dir) + unit_warn_if_dir_nonempty(UNIT(m), m->where); + unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_start); + + m->control_command_id = MOUNT_EXEC_MOUNT; + m->control_command = m->exec_command + MOUNT_EXEC_MOUNT; + + /* Create the source directory for bind-mounts if needed */ + if (p && mount_is_bind(p)) { + r = mkdir_p_label(p->what, m->directory_mode); + /* mkdir_p_label() can return -EEXIST if the target path exists and is not a directory - which is + * totally OK, in case the user wants us to overmount a non-directory inode. Also -EROFS can be + * returned on read-only filesystem. Moreover, -EACCES (and also maybe -EPERM?) may be returned + * when the path is on NFS. See issue #24120. All such errors will be logged in the debug level. */ + if (r < 0 && r != -EEXIST) + log_unit_full_errno(UNIT(m), + (r == -EROFS || ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_WARNING, + r, "Failed to make bind mount source '%s', ignoring: %m", p->what); + } + + if (p) { + r = mount_set_mount_command(m, m->control_command, p); + if (r < 0) { + log_unit_warning_errno(UNIT(m), r, "Failed to prepare mount command line: %m"); + goto fail; + } + } else { + r = log_unit_warning_errno(UNIT(m), SYNTHETIC_ERRNO(ENOENT), "No mount parameters to operate on."); + goto fail; + } + + mount_unwatch_control_pid(m); + + r = mount_spawn(m, m->control_command, &m->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(m), r, "Failed to spawn 'mount' task: %m"); + goto fail; + } + + mount_set_state(m, MOUNT_MOUNTING); + return; + +fail: + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); +} + +static void mount_set_reload_result(Mount *m, MountResult result) { + assert(m); + + /* Only store the first error we encounter */ + if (m->reload_result != MOUNT_SUCCESS) + return; + + m->reload_result = result; +} + +static void mount_enter_remounting(Mount *m) { + int r; + MountParameters *p; + + assert(m); + + /* Reset reload result when we are about to start a new remount operation */ + m->reload_result = MOUNT_SUCCESS; + + m->control_command_id = MOUNT_EXEC_REMOUNT; + m->control_command = m->exec_command + MOUNT_EXEC_REMOUNT; + + p = get_mount_parameters_fragment(m); + if (p) { + const char *o; + + if (p->options) + o = strjoina("remount,", p->options); + else + o = "remount"; + + r = exec_command_set(m->control_command, MOUNT_PATH, + p->what, m->where, + "-o", o, NULL); + if (r >= 0 && m->sloppy_options) + r = exec_command_append(m->control_command, "-s", NULL); + if (r >= 0 && m->read_write_only) + r = exec_command_append(m->control_command, "-w", NULL); + if (r >= 0 && p->fstype) + r = exec_command_append(m->control_command, "-t", p->fstype, NULL); + if (r < 0) { + log_unit_warning_errno(UNIT(m), r, "Failed to prepare remount command line: %m"); + goto fail; + } + + } else { + r = log_unit_warning_errno(UNIT(m), SYNTHETIC_ERRNO(ENOENT), "No mount parameters to operate on."); + goto fail; + } + + mount_unwatch_control_pid(m); + + r = mount_spawn(m, m->control_command, &m->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(m), r, "Failed to spawn 'remount' task: %m"); + goto fail; + } + + mount_set_state(m, MOUNT_REMOUNTING); + return; + +fail: + mount_set_reload_result(m, MOUNT_FAILURE_RESOURCES); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); +} + +static void mount_cycle_clear(Mount *m) { + assert(m); + + /* Clear all state we shall forget for this new cycle */ + + m->result = MOUNT_SUCCESS; + m->reload_result = MOUNT_SUCCESS; + exec_command_reset_status_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX); + UNIT(m)->reset_accounting = true; +} + +static int mount_start(Unit *u) { + Mount *m = MOUNT(u); + int r; + + assert(m); + + /* We cannot fulfill this request right now, try again later + * please! */ + if (IN_SET(m->state, + MOUNT_UNMOUNTING, + MOUNT_UNMOUNTING_SIGTERM, + MOUNT_UNMOUNTING_SIGKILL, + MOUNT_CLEANING)) + return -EAGAIN; + + /* Already on it! */ + if (IN_SET(m->state, MOUNT_MOUNTING, MOUNT_MOUNTING_DONE)) + return 0; + + assert(IN_SET(m->state, MOUNT_DEAD, MOUNT_FAILED)); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + mount_cycle_clear(m); + mount_enter_mounting(m); + + return 1; +} + +static int mount_stop(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + /* When we directly call umount() for a path, then the state of the corresponding mount unit may be + * outdated. Let's re-read mountinfo now and update the state. */ + if (m->invalidated_state) + (void) mount_process_proc_self_mountinfo(u->manager); + + switch (m->state) { + + case MOUNT_UNMOUNTING: + case MOUNT_UNMOUNTING_SIGKILL: + case MOUNT_UNMOUNTING_SIGTERM: + /* Already on it */ + return 0; + + case MOUNT_MOUNTING: + case MOUNT_MOUNTING_DONE: + case MOUNT_REMOUNTING: + /* If we are still waiting for /bin/mount, we go directly into kill mode. */ + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_SUCCESS); + return 0; + + case MOUNT_REMOUNTING_SIGTERM: + /* If we are already waiting for a hung remount, convert this to the matching unmounting state */ + mount_set_state(m, MOUNT_UNMOUNTING_SIGTERM); + return 0; + + case MOUNT_REMOUNTING_SIGKILL: + /* as above */ + mount_set_state(m, MOUNT_UNMOUNTING_SIGKILL); + return 0; + + case MOUNT_MOUNTED: + mount_enter_unmounting(m); + return 1; + + case MOUNT_CLEANING: + /* If we are currently cleaning, then abort it, brutally. */ + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS); + return 0; + + case MOUNT_DEAD: + case MOUNT_FAILED: + /* The mount has just been unmounted by somebody else. */ + return 0; + + default: + assert_not_reached(); + } +} + +static int mount_reload(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + assert(m->state == MOUNT_MOUNTED); + + mount_enter_remounting(m); + + return 1; +} + +static int mount_serialize(Unit *u, FILE *f, FDSet *fds) { + Mount *m = MOUNT(u); + + assert(m); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", mount_state_to_string(m->state)); + (void) serialize_item(f, "result", mount_result_to_string(m->result)); + (void) serialize_item(f, "reload-result", mount_result_to_string(m->reload_result)); + (void) serialize_item_format(f, "n-retry-umount", "%u", m->n_retry_umount); + (void) serialize_pidref(f, fds, "control-pid", &m->control_pid); + + if (m->control_command_id >= 0) + (void) serialize_item(f, "control-command", mount_exec_command_to_string(m->control_command_id)); + + return 0; +} + +static int mount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Mount *m = MOUNT(u); + int r; + + assert(m); + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + MountState state; + + state = mount_state_from_string(value); + if (state < 0) + log_unit_debug_errno(u, state, "Failed to parse state value: %s", value); + else + m->deserialized_state = state; + + } else if (streq(key, "result")) { + MountResult f; + + f = mount_result_from_string(value); + if (f < 0) + log_unit_debug_errno(u, f, "Failed to parse result value: %s", value); + else if (f != MOUNT_SUCCESS) + m->result = f; + + } else if (streq(key, "reload-result")) { + MountResult f; + + f = mount_result_from_string(value); + if (f < 0) + log_unit_debug_errno(u, f, "Failed to parse reload result value: %s", value); + else if (f != MOUNT_SUCCESS) + m->reload_result = f; + + } else if (streq(key, "n-retry-umount")) { + + r = safe_atou(value, &m->n_retry_umount); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse n-retry-umount value: %s", value); + + } else if (streq(key, "control-pid")) { + + pidref_done(&m->control_pid); + (void) deserialize_pidref(fds, value, &m->control_pid); + + } else if (streq(key, "control-command")) { + MountExecCommand id; + + id = mount_exec_command_from_string(value); + if (id < 0) + log_unit_debug_errno(u, id, "Failed to parse exec-command value: %s", value); + else { + m->control_command_id = id; + m->control_command = m->exec_command + id; + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static UnitActiveState mount_active_state(Unit *u) { + assert(u); + + return state_translation_table[MOUNT(u)->state]; +} + +static const char *mount_sub_state_to_string(Unit *u) { + assert(u); + + return mount_state_to_string(MOUNT(u)->state); +} + +static bool mount_may_gc(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + if (m->from_proc_self_mountinfo) + return false; + + return true; +} + +static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Mount *m = MOUNT(u); + MountResult f; + + assert(m); + assert(pid >= 0); + + if (pid != m->control_pid.pid) + return; + + /* So here's the thing, we really want to know before /usr/bin/mount or /usr/bin/umount exit whether + * they established/remove a mount. This is important when mounting, but even more so when unmounting + * since we need to deal with nested mounts and otherwise cannot safely determine whether to repeat + * the unmounts. In theory, the kernel fires /proc/self/mountinfo changes off before returning from + * the mount() or umount() syscalls, and thus we should see the changes to the proc file before we + * process the waitid() for the /usr/bin/(u)mount processes. However, this is unfortunately racy: we + * have to waitid() for processes using P_ALL (since we need to reap unexpected children that got + * reparented to PID 1), but when using P_ALL we might end up reaping processes that terminated just + * instants ago, i.e. already after our last event loop iteration (i.e. after the last point we might + * have noticed /proc/self/mountinfo events via epoll). This means event loop priorities for + * processing SIGCHLD vs. /proc/self/mountinfo IO events are not as relevant as we want. To fix that + * race, let's explicitly scan /proc/self/mountinfo before we start processing /usr/bin/(u)mount + * dying. It's ugly, but it makes our ordering systematic again, and makes sure we always see + * /proc/self/mountinfo changes before our mount/umount exits. */ + (void) mount_process_proc_self_mountinfo(u->manager); + + pidref_done(&m->control_pid); + + if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + f = MOUNT_SUCCESS; + else if (code == CLD_EXITED) + f = MOUNT_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = MOUNT_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = MOUNT_FAILURE_CORE_DUMP; + else + assert_not_reached(); + + if (IN_SET(m->state, MOUNT_REMOUNTING, MOUNT_REMOUNTING_SIGKILL, MOUNT_REMOUNTING_SIGTERM)) + mount_set_reload_result(m, f); + else if (m->result == MOUNT_SUCCESS) + m->result = f; + + if (m->control_command) { + exec_status_exit(&m->control_command->exec_status, &m->exec_context, pid, code, status); + + m->control_command = NULL; + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + } + + unit_log_process_exit( + u, + "Mount process", + mount_exec_command_to_string(m->control_command_id), + f == MOUNT_SUCCESS, + code, status); + + /* Note that due to the io event priority logic, we can be sure the new mountinfo is loaded + * before we process the SIGCHLD for the mount command. */ + + switch (m->state) { + + case MOUNT_MOUNTING: + /* Our mount point has not appeared in mountinfo. Something went wrong. */ + + if (f == MOUNT_SUCCESS) { + /* Either /bin/mount has an unexpected definition of success, + * or someone raced us and we lost. */ + log_unit_warning(UNIT(m), "Mount process finished, but there is no mount."); + f = MOUNT_FAILURE_PROTOCOL; + } + mount_enter_dead(m, f); + break; + + case MOUNT_MOUNTING_DONE: + mount_enter_mounted(m, f); + break; + + case MOUNT_REMOUNTING: + case MOUNT_REMOUNTING_SIGTERM: + case MOUNT_REMOUNTING_SIGKILL: + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + break; + + case MOUNT_UNMOUNTING: + + if (f == MOUNT_SUCCESS && m->from_proc_self_mountinfo) { + + /* Still a mount point? If so, let's try again. Most likely there were multiple mount points + * stacked on top of each other. We might exceed the timeout specified by the user overall, + * but we will stop as soon as any one umount times out. */ + + if (m->n_retry_umount < RETRY_UMOUNT_MAX) { + log_unit_debug(u, "Mount still present, trying again."); + m->n_retry_umount++; + mount_enter_unmounting(m); + } else { + log_unit_warning(u, "Mount still present after %u attempts to unmount, giving up.", m->n_retry_umount); + mount_enter_mounted(m, f); + } + } else + mount_enter_dead_or_mounted(m, f); + + break; + + case MOUNT_UNMOUNTING_SIGKILL: + case MOUNT_UNMOUNTING_SIGTERM: + mount_enter_dead_or_mounted(m, f); + break; + + case MOUNT_CLEANING: + if (m->clean_result == MOUNT_SUCCESS) + m->clean_result = f; + + mount_enter_dead(m, MOUNT_SUCCESS); + break; + + default: + assert_not_reached(); + } + + /* Notify clients about changed exit status */ + unit_add_to_dbus_queue(u); +} + +static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Mount *m = MOUNT(userdata); + + assert(m); + assert(m->timer_event_source == source); + + switch (m->state) { + + case MOUNT_MOUNTING: + case MOUNT_MOUNTING_DONE: + log_unit_warning(UNIT(m), "Mounting timed out. Terminating."); + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT); + break; + + case MOUNT_REMOUNTING: + log_unit_warning(UNIT(m), "Remounting timed out. Terminating remount process."); + mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); + mount_enter_signal(m, MOUNT_REMOUNTING_SIGTERM, MOUNT_SUCCESS); + break; + + case MOUNT_REMOUNTING_SIGTERM: + mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); + + if (m->kill_context.send_sigkill) { + log_unit_warning(UNIT(m), "Remounting timed out. Killing."); + mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS); + } else { + log_unit_warning(UNIT(m), "Remounting timed out. Skipping SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + } + break; + + case MOUNT_REMOUNTING_SIGKILL: + mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); + + log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + break; + + case MOUNT_UNMOUNTING: + log_unit_warning(UNIT(m), "Unmounting timed out. Terminating."); + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT); + break; + + case MOUNT_UNMOUNTING_SIGTERM: + if (m->kill_context.send_sigkill) { + log_unit_warning(UNIT(m), "Mount process timed out. Killing."); + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(m), "Mount process timed out. Skipping SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT); + } + break; + + case MOUNT_UNMOUNTING_SIGKILL: + log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT); + break; + + case MOUNT_CLEANING: + log_unit_warning(UNIT(m), "Cleaning timed out. killing."); + + if (m->clean_result == MOUNT_SUCCESS) + m->clean_result = MOUNT_FAILURE_TIMEOUT; + + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, 0); + break; + + default: + assert_not_reached(); + } + + return 0; +} + +static int mount_setup_new_unit( + Manager *m, + const char *name, + const char *what, + const char *where, + const char *options, + const char *fstype, + MountProcFlags *ret_flags, + Unit **ret) { + + _cleanup_(unit_freep) Unit *u = NULL; + int r; + + assert(m); + assert(name); + assert(ret_flags); + assert(ret); + + r = unit_new_for_name(m, sizeof(Mount), name, &u); + if (r < 0) + return r; + + r = free_and_strdup(&u->source_path, "/proc/self/mountinfo"); + if (r < 0) + return r; + + r = free_and_strdup(&MOUNT(u)->where, where); + if (r < 0) + return r; + + r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype); + if (r < 0) + return r; + + /* This unit was generated because /proc/self/mountinfo reported it. Remember this, so that by the + * time we load the unit file for it (and thus add in extra deps right after) we know what source to + * attributes the deps to. */ + MOUNT(u)->from_proc_self_mountinfo = true; + + r = mount_add_non_exec_dependencies(MOUNT(u)); + if (r < 0) + return r; + + /* We have only allocated the stub now, let's enqueue this unit for loading now, so that everything + * else is loaded in now. */ + unit_add_to_load_queue(u); + + *ret_flags = MOUNT_PROC_IS_MOUNTED | MOUNT_PROC_JUST_MOUNTED | MOUNT_PROC_JUST_CHANGED; + *ret = TAKE_PTR(u); + return 0; +} + +static int mount_setup_existing_unit( + Unit *u, + const char *what, + const char *where, + const char *options, + const char *fstype, + MountProcFlags *ret_flags) { + + int r; + + assert(u); + assert(ret_flags); + + if (!MOUNT(u)->where) { + MOUNT(u)->where = strdup(where); + if (!MOUNT(u)->where) + return -ENOMEM; + } + + /* In case we have multiple mounts established on the same mount point, let's merge flags set already + * for the current unit. Note that the flags field is reset on each iteration of reading + * /proc/self/mountinfo, hence we know for sure anything already set here is from the current + * iteration and thus worthy of taking into account. */ + MountProcFlags flags = + MOUNT(u)->proc_flags | MOUNT_PROC_IS_MOUNTED; + + r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype); + if (r < 0) + return r; + if (r > 0) + flags |= MOUNT_PROC_JUST_CHANGED; + + /* There are two conditions when we consider a mount point just mounted: when we haven't seen it in + * /proc/self/mountinfo before or when MOUNT_MOUNTING is our current state. Why bother with the + * latter? Shouldn't that be covered by the former? No, during reload it is not because we might then + * encounter a new /proc/self/mountinfo in combination with an old mount unit state (since it stems + * from the serialized state), and need to catch up. Since we know that the MOUNT_MOUNTING state is + * reached when we wait for the mount to appear we hence can assume that if we are in it, we are + * actually seeing it established for the first time. */ + if (!MOUNT(u)->from_proc_self_mountinfo || MOUNT(u)->state == MOUNT_MOUNTING) + flags |= MOUNT_PROC_JUST_MOUNTED; + + MOUNT(u)->from_proc_self_mountinfo = true; + + if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) { + /* The unit was previously not found or otherwise not loaded. Now that the unit shows up in + * /proc/self/mountinfo we should reconsider it this, hence set it to UNIT_LOADED. */ + u->load_state = UNIT_LOADED; + u->load_error = 0; + + flags |= MOUNT_PROC_JUST_CHANGED; + } + + if (FLAGS_SET(flags, MOUNT_PROC_JUST_CHANGED)) { + /* If things changed, then make sure that all deps are regenerated. Let's + * first remove all automatic deps, and then add in the new ones. */ + r = mount_add_non_exec_dependencies(MOUNT(u)); + if (r < 0) + return r; + } + + *ret_flags = flags; + return 0; +} + +static int mount_setup_unit( + Manager *m, + const char *what, + const char *where, + const char *options, + const char *fstype, + bool set_flags) { + + _cleanup_free_ char *e = NULL; + MountProcFlags flags; + Unit *u; + int r; + + assert(m); + assert(what); + assert(where); + assert(options); + assert(fstype); + + /* Ignore API mount points. They should never be referenced in + * dependencies ever. */ + if (mount_point_is_api(where) || mount_point_ignore(where)) + return 0; + + if (streq(fstype, "autofs")) + return 0; + + /* probably some kind of swap, ignore */ + if (!is_path(where)) + return 0; + + r = unit_name_from_path(where, ".mount", &e); + if (r < 0) + return log_struct_errno( + LOG_WARNING, r, + "MESSAGE_ID=" SD_MESSAGE_MOUNT_POINT_PATH_NOT_SUITABLE_STR, + "MOUNT_POINT=%s", where, + LOG_MESSAGE("Failed to generate valid unit name from mount point path '%s', ignoring mount point: %m", + where)); + + u = manager_get_unit(m, e); + if (u) + r = mount_setup_existing_unit(u, what, where, options, fstype, &flags); + else + /* First time we see this mount point meaning that it's not been initiated by a mount unit + * but rather by the sysadmin having called mount(8) directly. */ + r = mount_setup_new_unit(m, e, what, where, options, fstype, &flags, &u); + if (r < 0) + return log_warning_errno(r, "Failed to set up mount unit for '%s': %m", where); + + /* If the mount changed properties or state, let's notify our clients */ + if (flags & (MOUNT_PROC_JUST_CHANGED|MOUNT_PROC_JUST_MOUNTED)) + unit_add_to_dbus_queue(u); + + if (set_flags) + MOUNT(u)->proc_flags = flags; + + return 0; +} + +static int mount_load_proc_self_mountinfo(Manager *m, bool set_flags) { + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + int r; + + assert(m); + + r = libmount_parse(NULL, NULL, &table, &iter); + if (r < 0) + return log_error_errno(r, "Failed to parse /proc/self/mountinfo: %m"); + + for (;;) { + struct libmnt_fs *fs; + const char *device, *path, *options, *fstype; + + r = mnt_table_next_fs(table, iter, &fs); + if (r == 1) + break; + if (r < 0) + return log_error_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m"); + + device = mnt_fs_get_source(fs); + path = mnt_fs_get_target(fs); + options = mnt_fs_get_options(fs); + fstype = mnt_fs_get_fstype(fs); + + if (!device || !path) + continue; + + device_found_node(m, device, DEVICE_FOUND_MOUNT, DEVICE_FOUND_MOUNT); + + (void) mount_setup_unit(m, device, path, options, fstype, set_flags); + } + + return 0; +} + +static void mount_shutdown(Manager *m) { + assert(m); + + m->mount_event_source = sd_event_source_disable_unref(m->mount_event_source); + + mnt_unref_monitor(m->mount_monitor); + m->mount_monitor = NULL; +} + +static int mount_get_timeout(Unit *u, usec_t *timeout) { + Mount *m = MOUNT(u); + usec_t t; + int r; + + assert(m); + assert(u); + + if (!m->timer_event_source) + return 0; + + r = sd_event_source_get_time(m->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static void mount_enumerate_perpetual(Manager *m) { + Unit *u; + int r; + + assert(m); + + /* Whatever happens, we know for sure that the root directory is around, and cannot go away. Let's + * unconditionally synthesize it here and mark it as perpetual. */ + + u = manager_get_unit(m, SPECIAL_ROOT_MOUNT); + if (!u) { + r = unit_new_for_name(m, sizeof(Mount), SPECIAL_ROOT_MOUNT, &u); + if (r < 0) { + log_error_errno(r, "Failed to allocate the special " SPECIAL_ROOT_MOUNT " unit: %m"); + return; + } + } + + u->perpetual = true; + MOUNT(u)->deserialized_state = MOUNT_MOUNTED; + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); +} + +static bool mount_is_mounted(Mount *m) { + assert(m); + + return UNIT(m)->perpetual || FLAGS_SET(m->proc_flags, MOUNT_PROC_IS_MOUNTED); +} + +static int mount_on_ratelimit_expire(sd_event_source *s, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + Job *j; + + /* Let's enqueue all start jobs that were previously skipped because of active ratelimit. */ + HASHMAP_FOREACH(j, m->jobs) { + if (j->unit->type != UNIT_MOUNT) + continue; + + job_add_to_run_queue(j); + } + + /* By entering ratelimited state we made all mount start jobs not runnable, now rate limit is over so + * let's make sure we dispatch them in the next iteration. */ + manager_trigger_run_queue(m); + + return 0; +} + +static void mount_enumerate(Manager *m) { + int r; + + assert(m); + + mnt_init_debug(0); + + if (!m->mount_monitor) { + unsigned mount_rate_limit_burst = 5; + int fd; + + m->mount_monitor = mnt_new_monitor(); + if (!m->mount_monitor) { + log_oom(); + goto fail; + } + + r = mnt_monitor_enable_kernel(m->mount_monitor, 1); + if (r < 0) { + log_error_errno(r, "Failed to enable watching of kernel mount events: %m"); + goto fail; + } + + r = mnt_monitor_enable_userspace(m->mount_monitor, 1, NULL); + if (r < 0) { + log_error_errno(r, "Failed to enable watching of userspace mount events: %m"); + goto fail; + } + + /* mnt_unref_monitor() will close the fd */ + fd = r = mnt_monitor_get_fd(m->mount_monitor); + if (r < 0) { + log_error_errno(r, "Failed to acquire watch file descriptor: %m"); + goto fail; + } + + r = sd_event_add_io(m->event, &m->mount_event_source, fd, EPOLLIN, mount_dispatch_io, m); + if (r < 0) { + log_error_errno(r, "Failed to watch mount file descriptor: %m"); + goto fail; + } + + r = sd_event_source_set_priority(m->mount_event_source, SD_EVENT_PRIORITY_NORMAL-10); + if (r < 0) { + log_error_errno(r, "Failed to adjust mount watch priority: %m"); + goto fail; + } + + /* Let users override the default (5 in 1s), as it stalls the boot sequence on busy systems. */ + const char *e = secure_getenv("SYSTEMD_DEFAULT_MOUNT_RATE_LIMIT_BURST"); + if (e) { + r = safe_atou(e, &mount_rate_limit_burst); + if (r < 0) + log_debug("Invalid value in $SYSTEMD_DEFAULT_MOUNT_RATE_LIMIT_BURST, ignoring: %s", e); + } + + r = sd_event_source_set_ratelimit(m->mount_event_source, 1 * USEC_PER_SEC, mount_rate_limit_burst); + if (r < 0) { + log_error_errno(r, "Failed to enable rate limit for mount events: %m"); + goto fail; + } + + r = sd_event_source_set_ratelimit_expire_callback(m->mount_event_source, mount_on_ratelimit_expire); + if (r < 0) { + log_error_errno(r, "Failed to enable rate limit for mount events: %m"); + goto fail; + } + + (void) sd_event_source_set_description(m->mount_event_source, "mount-monitor-dispatch"); + } + + r = mount_load_proc_self_mountinfo(m, false); + if (r < 0) + goto fail; + + return; + +fail: + mount_shutdown(m); +} + +static int drain_libmount(Manager *m) { + bool rescan = false; + int r; + + assert(m); + + /* Drain all events and verify that the event is valid. + * + * Note that libmount also monitors /run/mount mkdir if the directory does not exist yet. The mkdir + * may generate event which is irrelevant for us. + * + * error: r < 0; valid: r == 0, false positive: r == 1 */ + do { + r = mnt_monitor_next_change(m->mount_monitor, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to drain libmount events: %m"); + if (r == 0) + rescan = true; + } while (r == 0); + + return rescan; +} + +static int mount_process_proc_self_mountinfo(Manager *m) { + _cleanup_set_free_ Set *around = NULL, *gone = NULL; + const char *what; + int r; + + assert(m); + + r = drain_libmount(m); + if (r <= 0) + return r; + + r = mount_load_proc_self_mountinfo(m, true); + if (r < 0) { + /* Reset flags, just in case, for later calls */ + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT]) + MOUNT(u)->proc_flags = 0; + + return 0; + } + + manager_dispatch_load_queue(m); + + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT]) { + Mount *mount = MOUNT(u); + + mount->invalidated_state = false; + + if (!mount_is_mounted(mount)) { + + /* A mount point is not around right now. It might be gone, or might never have + * existed. */ + + if (mount->from_proc_self_mountinfo && + mount->parameters_proc_self_mountinfo.what) + /* Remember that this device might just have disappeared */ + if (set_put_strdup_full(&gone, &path_hash_ops_free, mount->parameters_proc_self_mountinfo.what) < 0) + log_oom(); /* we don't care too much about OOM here... */ + + mount->from_proc_self_mountinfo = false; + assert_se(update_parameters_proc_self_mountinfo(mount, NULL, NULL, NULL) >= 0); + + switch (mount->state) { + + case MOUNT_MOUNTED: + /* This has just been unmounted by somebody else, follow the state change. */ + mount_enter_dead(mount, MOUNT_SUCCESS); + break; + + case MOUNT_MOUNTING_DONE: + /* The mount command may add the corresponding proc mountinfo entry and + * then remove it because of an internal error. E.g., fuse.sshfs seems + * to do that when the connection fails. See #17617. To handle such the + * case, let's once set the state back to mounting. Then, the unit can + * correctly enter the failed state later in mount_sigchld(). */ + mount_set_state(mount, MOUNT_MOUNTING); + break; + + default: + break; + } + + } else if (mount->proc_flags & (MOUNT_PROC_JUST_MOUNTED|MOUNT_PROC_JUST_CHANGED)) { + + /* A mount point was added or changed */ + + switch (mount->state) { + + case MOUNT_DEAD: + case MOUNT_FAILED: + + /* This has just been mounted by somebody else, follow the state change, but let's + * generate a new invocation ID for this implicitly and automatically. */ + (void) unit_acquire_invocation_id(u); + mount_cycle_clear(mount); + mount_enter_mounted(mount, MOUNT_SUCCESS); + break; + + case MOUNT_MOUNTING: + mount_set_state(mount, MOUNT_MOUNTING_DONE); + break; + + default: + /* Nothing really changed, but let's issue an notification call nonetheless, + * in case somebody is waiting for this. (e.g. file system ro/rw + * remounts.) */ + mount_set_state(mount, mount->state); + break; + } + } + + if (mount_is_mounted(mount) && + mount->from_proc_self_mountinfo && + mount->parameters_proc_self_mountinfo.what) + /* Track devices currently used */ + if (set_put_strdup_full(&around, &path_hash_ops_free, mount->parameters_proc_self_mountinfo.what) < 0) + log_oom(); + + /* Reset the flags for later calls */ + mount->proc_flags = 0; + } + + SET_FOREACH(what, gone) { + if (set_contains(around, what)) + continue; + + /* Let the device units know that the device is no longer mounted */ + device_found_node(m, what, DEVICE_NOT_FOUND, DEVICE_FOUND_MOUNT); + } + + return 0; +} + +static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(revents & EPOLLIN); + + return mount_process_proc_self_mountinfo(m); +} + +int mount_invalidate_state_by_path(Manager *manager, const char *path) { + _cleanup_free_ char *name = NULL; + Unit *u; + int r; + + assert(manager); + assert(path); + + r = unit_name_from_path(path, ".mount", &name); + if (r < 0) + return log_debug_errno(r, "Failed to generate unit name from path \"%s\", ignoring: %m", path); + + u = manager_get_unit(manager, name); + if (!u) + return -ENOENT; + + MOUNT(u)->invalidated_state = true; + return 0; +} + +static void mount_reset_failed(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + if (m->state == MOUNT_FAILED) + mount_set_state(m, MOUNT_DEAD); + + m->result = MOUNT_SUCCESS; + m->reload_result = MOUNT_SUCCESS; + m->clean_result = MOUNT_SUCCESS; +} + +static PidRef* mount_control_pid(Unit *u) { + return &ASSERT_PTR(MOUNT(u))->control_pid; +} + +static int mount_clean(Unit *u, ExecCleanMask mask) { + _cleanup_strv_free_ char **l = NULL; + Mount *m = MOUNT(u); + int r; + + assert(m); + assert(mask != 0); + + if (m->state != MOUNT_DEAD) + return -EBUSY; + + r = exec_context_get_clean_directories(&m->exec_context, u->manager->prefix, mask, &l); + if (r < 0) + return r; + + if (strv_isempty(l)) + return -EUNATCH; + + mount_unwatch_control_pid(m); + m->clean_result = MOUNT_SUCCESS; + m->control_command = NULL; + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + + r = mount_arm_timer(m, /* relative= */ true, m->exec_context.timeout_clean_usec); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to install timer: %m"); + goto fail; + } + + r = unit_fork_and_watch_rm_rf(u, l, &m->control_pid); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m"); + goto fail; + } + + mount_set_state(m, MOUNT_CLEANING); + return 0; + +fail: + m->clean_result = MOUNT_FAILURE_RESOURCES; + m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source); + return r; +} + +static int mount_can_clean(Unit *u, ExecCleanMask *ret) { + Mount *m = MOUNT(u); + + assert(m); + + return exec_context_get_clean_mask(&m->exec_context, ret); +} + +static int mount_can_start(Unit *u) { + Mount *m = MOUNT(u); + int r; + + assert(m); + + r = unit_test_start_limit(u); + if (r < 0) { + mount_enter_dead(m, MOUNT_FAILURE_START_LIMIT_HIT); + return r; + } + + return 1; +} + +static int mount_subsystem_ratelimited(Manager *m) { + assert(m); + + if (!m->mount_event_source) + return false; + + return sd_event_source_is_ratelimited(m->mount_event_source); +} + +char* mount_get_what_escaped(const Mount *m) { + _cleanup_free_ char *escaped = NULL; + const char *s = NULL; + + assert(m); + + if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.what) + s = m->parameters_proc_self_mountinfo.what; + else if (m->from_fragment && m->parameters_fragment.what) + s = m->parameters_fragment.what; + + if (s) { + escaped = utf8_escape_invalid(s); + if (!escaped) + return NULL; + } + + return escaped ? TAKE_PTR(escaped) : strdup(""); +} + +char* mount_get_options_escaped(const Mount *m) { + _cleanup_free_ char *escaped = NULL; + const char *s = NULL; + + assert(m); + + if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.options) + s = m->parameters_proc_self_mountinfo.options; + else if (m->from_fragment && m->parameters_fragment.options) + s = m->parameters_fragment.options; + + if (s) { + escaped = utf8_escape_invalid(s); + if (!escaped) + return NULL; + } + + return escaped ? TAKE_PTR(escaped) : strdup(""); +} + +const char* mount_get_fstype(const Mount *m) { + assert(m); + + if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.fstype) + return m->parameters_proc_self_mountinfo.fstype; + + if (m->from_fragment && m->parameters_fragment.fstype) + return m->parameters_fragment.fstype; + + return NULL; +} + +static const char* const mount_exec_command_table[_MOUNT_EXEC_COMMAND_MAX] = { + [MOUNT_EXEC_MOUNT] = "ExecMount", + [MOUNT_EXEC_UNMOUNT] = "ExecUnmount", + [MOUNT_EXEC_REMOUNT] = "ExecRemount", +}; + +DEFINE_STRING_TABLE_LOOKUP(mount_exec_command, MountExecCommand); + +static const char* const mount_result_table[_MOUNT_RESULT_MAX] = { + [MOUNT_SUCCESS] = "success", + [MOUNT_FAILURE_RESOURCES] = "resources", + [MOUNT_FAILURE_TIMEOUT] = "timeout", + [MOUNT_FAILURE_EXIT_CODE] = "exit-code", + [MOUNT_FAILURE_SIGNAL] = "signal", + [MOUNT_FAILURE_CORE_DUMP] = "core-dump", + [MOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [MOUNT_FAILURE_PROTOCOL] = "protocol", +}; + +DEFINE_STRING_TABLE_LOOKUP(mount_result, MountResult); + +const UnitVTable mount_vtable = { + .object_size = sizeof(Mount), + .exec_context_offset = offsetof(Mount, exec_context), + .cgroup_context_offset = offsetof(Mount, cgroup_context), + .kill_context_offset = offsetof(Mount, kill_context), + .exec_runtime_offset = offsetof(Mount, exec_runtime), + + .sections = + "Unit\0" + "Mount\0" + "Install\0", + .private_section = "Mount", + + .can_transient = true, + .can_fail = true, + .exclude_from_switch_root_serialization = true, + + .init = mount_init, + .load = mount_load, + .done = mount_done, + + .coldplug = mount_coldplug, + .catchup = mount_catchup, + + .dump = mount_dump, + + .start = mount_start, + .stop = mount_stop, + .reload = mount_reload, + + .clean = mount_clean, + .can_clean = mount_can_clean, + + .serialize = mount_serialize, + .deserialize_item = mount_deserialize_item, + + .active_state = mount_active_state, + .sub_state_to_string = mount_sub_state_to_string, + + .will_restart = unit_will_restart_default, + + .may_gc = mount_may_gc, + .is_extrinsic = mount_is_extrinsic, + + .sigchld_event = mount_sigchld_event, + + .reset_failed = mount_reset_failed, + + .control_pid = mount_control_pid, + + .bus_set_property = bus_mount_set_property, + .bus_commit_properties = bus_mount_commit_properties, + + .get_timeout = mount_get_timeout, + + .enumerate_perpetual = mount_enumerate_perpetual, + .enumerate = mount_enumerate, + .shutdown = mount_shutdown, + .subsystem_ratelimited = mount_subsystem_ratelimited, + + .status_message_formats = { + .starting_stopping = { + [0] = "Mounting %s...", + [1] = "Unmounting %s...", + }, + .finished_start_job = { + [JOB_DONE] = "Mounted %s.", + [JOB_FAILED] = "Failed to mount %s.", + [JOB_TIMEOUT] = "Timed out mounting %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Unmounted %s.", + [JOB_FAILED] = "Failed unmounting %s.", + [JOB_TIMEOUT] = "Timed out unmounting %s.", + }, + }, + + .can_start = mount_can_start, + + .notify_plymouth = true, +}; diff --git a/src/core/mount.h b/src/core/mount.h new file mode 100644 index 0000000..6712c16 --- /dev/null +++ b/src/core/mount.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Mount Mount; + +#include "dynamic-user.h" +#include "kill.h" +#include "pidref.h" +#include "unit.h" + +typedef enum MountExecCommand { + MOUNT_EXEC_MOUNT, + MOUNT_EXEC_UNMOUNT, + MOUNT_EXEC_REMOUNT, + _MOUNT_EXEC_COMMAND_MAX, + _MOUNT_EXEC_COMMAND_INVALID = -EINVAL, +} MountExecCommand; + +typedef enum MountResult { + MOUNT_SUCCESS, + MOUNT_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */ + MOUNT_FAILURE_TIMEOUT, + MOUNT_FAILURE_EXIT_CODE, + MOUNT_FAILURE_SIGNAL, + MOUNT_FAILURE_CORE_DUMP, + MOUNT_FAILURE_START_LIMIT_HIT, + MOUNT_FAILURE_PROTOCOL, + _MOUNT_RESULT_MAX, + _MOUNT_RESULT_INVALID = -EINVAL, +} MountResult; + +typedef struct MountParameters { + char *what; + char *options; + char *fstype; +} MountParameters; + +/* Used while looking for mount points that vanished or got added from/to /proc/self/mountinfo */ +typedef enum MountProcFlags { + MOUNT_PROC_IS_MOUNTED = 1 << 0, + MOUNT_PROC_JUST_MOUNTED = 1 << 1, + MOUNT_PROC_JUST_CHANGED = 1 << 2, +} MountProcFlags; + +struct Mount { + Unit meta; + + char *where; + + MountParameters parameters_proc_self_mountinfo; + MountParameters parameters_fragment; + + bool invalidated_state:1; /* Set when the 'state' of the mount unit may be outdated, and we need to + * re-read /proc/self/mountinfo. */ + bool from_proc_self_mountinfo:1; + bool from_fragment:1; + + MountProcFlags proc_flags; + + bool sloppy_options; + + bool lazy_unmount; + bool force_unmount; + + bool read_write_only; + + MountResult result; + MountResult reload_result; + MountResult clean_result; + + mode_t directory_mode; + + usec_t timeout_usec; + + ExecCommand exec_command[_MOUNT_EXEC_COMMAND_MAX]; + + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ExecRuntime *exec_runtime; + + MountState state, deserialized_state; + + ExecCommand* control_command; + MountExecCommand control_command_id; + PidRef control_pid; + + sd_event_source *timer_event_source; + + unsigned n_retry_umount; +}; + +extern const UnitVTable mount_vtable; + +void mount_fd_event(Manager *m, int events); + +int mount_invalidate_state_by_path(Manager *manager, const char *path); + +char* mount_get_what_escaped(const Mount *m); +char* mount_get_options_escaped(const Mount *m); +const char* mount_get_fstype(const Mount *m); + +const char* mount_exec_command_to_string(MountExecCommand i) _const_; +MountExecCommand mount_exec_command_from_string(const char *s) _pure_; + +const char* mount_result_to_string(MountResult i) _const_; +MountResult mount_result_from_string(const char *s) _pure_; + +DEFINE_CAST(MOUNT, Mount); diff --git a/src/core/namespace.c b/src/core/namespace.c new file mode 100644 index 0000000..88681aa --- /dev/null +++ b/src/core/namespace.c @@ -0,0 +1,3047 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#if WANT_LINUX_FS_H +#include +#endif + +#include "alloc-util.h" +#include "base-filesystem.h" +#include "chase.h" +#include "dev-setup.h" +#include "devnum-util.h" +#include "env-util.h" +#include "escape.h" +#include "extension-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "glyph-util.h" +#include "label-util.h" +#include "list.h" +#include "lock-util.h" +#include "loop-util.h" +#include "loopback-setup.h" +#include "missing_syscall.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "namespace.h" +#include "nsflags.h" +#include "nulstr-util.h" +#include "os-util.h" +#include "path-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "sort-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "user-util.h" + +#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC) + +typedef enum MountMode { + /* This is ordered by priority! */ + MOUNT_INACCESSIBLE, + MOUNT_OVERLAY, + MOUNT_IMAGE, + MOUNT_BIND, + MOUNT_BIND_RECURSIVE, + MOUNT_PRIVATE_TMP, + MOUNT_PRIVATE_TMP_READ_ONLY, + MOUNT_PRIVATE_DEV, + MOUNT_BIND_DEV, + MOUNT_EMPTY_DIR, + MOUNT_PRIVATE_SYSFS, + MOUNT_BIND_SYSFS, + MOUNT_PROCFS, + MOUNT_READ_ONLY, + MOUNT_READ_WRITE, + MOUNT_NOEXEC, + MOUNT_EXEC, + MOUNT_TMPFS, + MOUNT_RUN, + MOUNT_EXTENSION_DIRECTORY, /* Bind-mounted outside the root directory, and used by subsequent mounts */ + MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */ + MOUNT_MQUEUEFS, + MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */ + _MOUNT_MODE_MAX, + _MOUNT_MODE_INVALID = -EINVAL, +} MountMode; + +typedef enum MountEntryState { + MOUNT_PENDING, + MOUNT_APPLIED, + MOUNT_SKIPPED, + _MOUNT_ENTRY_STATE_MAX, + _MOUNT_ENTRY_STATE_INVALID = -EINVAL, +} MountEntryState; + +typedef struct MountEntry { + const char *path_const; /* Memory allocated on stack or static */ + MountMode mode; + bool ignore:1; /* Ignore if path does not exist? */ + bool has_prefix:1; /* Already is prefixed by the root dir? */ + bool read_only:1; /* Shall this mount point be read-only? */ + bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */ + bool noexec:1; /* Shall set MS_NOEXEC on the mount itself */ + bool exec:1; /* Shall clear MS_NOEXEC on the mount itself */ + MountEntryState state; /* Whether it was already processed or skipped */ + char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */ + const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */ + char *unprefixed_path_malloc; + const char *source_const; /* The source path, for bind mounts or images */ + char *source_malloc; + const char *options_const;/* Mount options for tmpfs */ + char *options_malloc; + unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */ + unsigned n_followed; + LIST_HEAD(MountOptions, image_options_const); + char **overlay_layers; +} MountEntry; + +typedef struct MountList { + MountEntry *mounts; + size_t n_mounts; +} MountList; + +/* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into the it, but only as a fallback if the user hasn't mounted + * something there already. These mounts are hence overridden by any other explicitly configured mounts. */ +static const MountEntry apivfs_table[] = { + { "/proc", MOUNT_PROCFS, false }, + { "/dev", MOUNT_BIND_DEV, false }, + { "/sys", MOUNT_BIND_SYSFS, false }, + { "/run", MOUNT_RUN, false, .options_const = "mode=0755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME }, +}; + +/* ProtectKernelTunables= option and the related filesystem APIs */ +static const MountEntry protect_kernel_tunables_proc_table[] = { + { "/proc/acpi", MOUNT_READ_ONLY, true }, + { "/proc/apm", MOUNT_READ_ONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */ + { "/proc/asound", MOUNT_READ_ONLY, true }, + { "/proc/bus", MOUNT_READ_ONLY, true }, + { "/proc/fs", MOUNT_READ_ONLY, true }, + { "/proc/irq", MOUNT_READ_ONLY, true }, + { "/proc/kallsyms", MOUNT_INACCESSIBLE, true }, + { "/proc/kcore", MOUNT_INACCESSIBLE, true }, + { "/proc/latency_stats", MOUNT_READ_ONLY, true }, + { "/proc/mtrr", MOUNT_READ_ONLY, true }, + { "/proc/scsi", MOUNT_READ_ONLY, true }, + { "/proc/sys", MOUNT_READ_ONLY, true }, + { "/proc/sysrq-trigger", MOUNT_READ_ONLY, true }, + { "/proc/timer_stats", MOUNT_READ_ONLY, true }, +}; + +static const MountEntry protect_kernel_tunables_sys_table[] = { + { "/sys", MOUNT_READ_ONLY, false }, + { "/sys/fs/bpf", MOUNT_READ_ONLY, true }, + { "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */ + { "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true }, + { "/sys/kernel/debug", MOUNT_READ_ONLY, true }, + { "/sys/kernel/tracing", MOUNT_READ_ONLY, true }, +}; + +/* ProtectKernelModules= option */ +static const MountEntry protect_kernel_modules_table[] = { + { "/usr/lib/modules", MOUNT_INACCESSIBLE, true }, +}; + +/* ProtectKernelLogs= option */ +static const MountEntry protect_kernel_logs_proc_table[] = { + { "/proc/kmsg", MOUNT_INACCESSIBLE, true }, +}; + +static const MountEntry protect_kernel_logs_dev_table[] = { + { "/dev/kmsg", MOUNT_INACCESSIBLE, true }, +}; + +/* + * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of + * system should be protected by ProtectSystem= + */ +static const MountEntry protect_home_read_only_table[] = { + { "/home", MOUNT_READ_ONLY, true }, + { "/run/user", MOUNT_READ_ONLY, true }, + { "/root", MOUNT_READ_ONLY, true }, +}; + +/* ProtectHome=tmpfs table */ +static const MountEntry protect_home_tmpfs_table[] = { + { "/home", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME }, + { "/run/user", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME }, + { "/root", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME }, +}; + +/* ProtectHome=yes table */ +static const MountEntry protect_home_yes_table[] = { + { "/home", MOUNT_INACCESSIBLE, true }, + { "/run/user", MOUNT_INACCESSIBLE, true }, + { "/root", MOUNT_INACCESSIBLE, true }, +}; + +/* ProtectSystem=yes table */ +static const MountEntry protect_system_yes_table[] = { + { "/usr", MOUNT_READ_ONLY, false }, + { "/boot", MOUNT_READ_ONLY, true }, + { "/efi", MOUNT_READ_ONLY, true }, +}; + +/* ProtectSystem=full includes ProtectSystem=yes */ +static const MountEntry protect_system_full_table[] = { + { "/usr", MOUNT_READ_ONLY, false }, + { "/boot", MOUNT_READ_ONLY, true }, + { "/efi", MOUNT_READ_ONLY, true }, + { "/etc", MOUNT_READ_ONLY, false }, +}; + +/* ProtectSystem=strict table. In this strict mode, we mount everything read-only, except for /proc, /dev, + * /sys which are the kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables= + * protect those, and these options should be fully orthogonal. (And of course /home and friends are also + * left writable, as ProtectHome= shall manage those, orthogonally). + */ +static const MountEntry protect_system_strict_table[] = { + { "/", MOUNT_READ_ONLY, false }, + { "/proc", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */ + { "/sys", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */ + { "/dev", MOUNT_READ_WRITE_IMPLICIT, false }, /* PrivateDevices= */ + { "/home", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */ + { "/run/user", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */ + { "/root", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */ +}; + +/* ProtectHostname=yes able */ +static const MountEntry protect_hostname_table[] = { + { "/proc/sys/kernel/hostname", MOUNT_READ_ONLY, false }, + { "/proc/sys/kernel/domainname", MOUNT_READ_ONLY, false }, +}; + +static const char * const mount_mode_table[_MOUNT_MODE_MAX] = { + [MOUNT_INACCESSIBLE] = "inaccessible", + [MOUNT_OVERLAY] = "overlay", + [MOUNT_IMAGE] = "image", + [MOUNT_BIND] = "bind", + [MOUNT_BIND_RECURSIVE] = "bind-recursive", + [MOUNT_PRIVATE_TMP] = "private-tmp", + [MOUNT_PRIVATE_TMP_READ_ONLY] = "private-tmp-read-only", + [MOUNT_PRIVATE_DEV] = "private-dev", + [MOUNT_BIND_DEV] = "bind-dev", + [MOUNT_EMPTY_DIR] = "empty-dir", + [MOUNT_PRIVATE_SYSFS] = "private-sysfs", + [MOUNT_BIND_SYSFS] = "bind-sysfs", + [MOUNT_PROCFS] = "procfs", + [MOUNT_READ_ONLY] = "read-only", + [MOUNT_READ_WRITE] = "read-write", + [MOUNT_NOEXEC] = "noexec", + [MOUNT_EXEC] = "exec", + [MOUNT_TMPFS] = "tmpfs", + [MOUNT_RUN] = "run", + [MOUNT_EXTENSION_DIRECTORY] = "extension-directory", + [MOUNT_EXTENSION_IMAGE] = "extension-image", + [MOUNT_MQUEUEFS] = "mqueuefs", + [MOUNT_READ_WRITE_IMPLICIT] = "read-write-implicit", +}; + +/* Helper struct for naming simplicity and reusability */ +static const struct { + const char *level_env; + const char *level_env_print; +} image_class_info[_IMAGE_CLASS_MAX] = { + [IMAGE_SYSEXT] = { + .level_env = "SYSEXT_LEVEL", + .level_env_print = " SYSEXT_LEVEL=", + }, + [IMAGE_CONFEXT] = { + .level_env = "CONFEXT_LEVEL", + .level_env_print = " CONFEXT_LEVEL=", + } +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode); + +static const char *mount_entry_path(const MountEntry *p) { + assert(p); + + /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that, + * otherwise the stack/static ->path field is returned. */ + + return p->path_malloc ?: p->path_const; +} + +static const char *mount_entry_unprefixed_path(const MountEntry *p) { + assert(p); + + /* Returns the unprefixed path (ie: before prefix_where_needed() ran), if any */ + + return p->unprefixed_path_malloc ?: p->unprefixed_path_const ?: mount_entry_path(p); +} + +static void mount_entry_consume_prefix(MountEntry *p, char *new_path) { + assert(p); + assert(p->path_malloc || p->path_const); + assert(new_path); + + /* Saves current path in unprefixed_ variable, and takes over new_path */ + + free_and_replace(p->unprefixed_path_malloc, p->path_malloc); + /* If we didn't have a path on the heap, then it's a static one */ + if (!p->unprefixed_path_malloc) + p->unprefixed_path_const = p->path_const; + p->path_malloc = new_path; + p->has_prefix = true; +} + +static bool mount_entry_read_only(const MountEntry *p) { + assert(p); + + return p->read_only || IN_SET(p->mode, MOUNT_READ_ONLY, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_TMP_READ_ONLY); +} + +static bool mount_entry_noexec(const MountEntry *p) { + assert(p); + + return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS); +} + +static bool mount_entry_exec(const MountEntry *p) { + assert(p); + + return p->exec || p->mode == MOUNT_EXEC; +} + +static const char *mount_entry_source(const MountEntry *p) { + assert(p); + + return p->source_malloc ?: p->source_const; +} + +static const char *mount_entry_options(const MountEntry *p) { + assert(p); + + return p->options_malloc ?: p->options_const; +} + +static void mount_entry_done(MountEntry *p) { + assert(p); + + p->path_malloc = mfree(p->path_malloc); + p->unprefixed_path_malloc = mfree(p->unprefixed_path_malloc); + p->source_malloc = mfree(p->source_malloc); + p->options_malloc = mfree(p->options_malloc); + p->overlay_layers = strv_free(p->overlay_layers); +} + +static void mount_list_done(MountList *ml) { + assert(ml); + + FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) + mount_entry_done(m); + + ml->mounts = mfree(ml->mounts); + ml->n_mounts = 0; +} + +static MountEntry *mount_list_extend(MountList *ml) { + assert(ml); + + if (!GREEDY_REALLOC0(ml->mounts, ml->n_mounts+1)) + return NULL; + + return ml->mounts + ml->n_mounts++; +} + +static int append_access_mounts(MountList *ml, char **strv, MountMode mode, bool forcibly_require_prefix) { + assert(ml); + + /* Adds a list of user-supplied READ_WRITE/READ_WRITE_IMPLICIT/READ_ONLY/INACCESSIBLE entries */ + + STRV_FOREACH(i, strv) { + bool ignore = false, needs_prefix = false; + const char *e = *i; + + /* Look for any prefixes */ + if (startswith(e, "-")) { + e++; + ignore = true; + } + if (startswith(e, "+")) { + e++; + needs_prefix = true; + } + + if (!path_is_absolute(e)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", e); + + MountEntry *me = mount_list_extend(ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = e, + .mode = mode, + .ignore = ignore, + .has_prefix = !needs_prefix && !forcibly_require_prefix, + }; + } + + return 0; +} + +static int append_empty_dir_mounts(MountList *ml, char **strv) { + assert(ml); + + /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the + * "/private/" boundary directories for DynamicUser=1. */ + + STRV_FOREACH(i, strv) { + MountEntry *me = mount_list_extend(ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = *i, + .mode = MOUNT_EMPTY_DIR, + .ignore = false, + .read_only = true, + .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, + }; + } + + return 0; +} + +static int append_bind_mounts(MountList *ml, const BindMount *binds, size_t n) { + assert(ml); + assert(binds || n == 0); + + FOREACH_ARRAY(b, binds, n) { + MountEntry *me = mount_list_extend(ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = b->destination, + .mode = b->recursive ? MOUNT_BIND_RECURSIVE : MOUNT_BIND, + .read_only = b->read_only, + .nosuid = b->nosuid, + .source_const = b->source, + .ignore = b->ignore_enoent, + }; + } + + return 0; +} + +static int append_mount_images(MountList *ml, const MountImage *mount_images, size_t n) { + assert(ml); + assert(mount_images || n == 0); + + FOREACH_ARRAY(m, mount_images, n) { + MountEntry *me = mount_list_extend(ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = m->destination, + .mode = MOUNT_IMAGE, + .source_const = m->source, + .image_options_const = m->mount_options, + .ignore = m->ignore_enoent, + }; + } + + return 0; +} + +static int append_extensions( + MountList *ml, + const char *root, + const char *extension_dir, + char **hierarchies, + const MountImage *mount_images, + size_t n, + char **extension_directories) { + + char ***overlays = NULL; + size_t n_overlays = 0; + int r; + + assert(ml); + + if (n == 0 && strv_isempty(extension_directories)) + return 0; + + assert(extension_dir); + + n_overlays = strv_length(hierarchies); + if (n_overlays == 0) + return 0; + + /* Prepare a list of overlays, that will have as each element a strv containing all the layers that + * will later be concatenated as a lowerdir= parameter for the mount operation. + * The overlays vector will have the same number of elements and will correspond to the + * hierarchies vector, so they can be iterated upon together. */ + overlays = new0(char**, n_overlays); + if (!overlays) + return -ENOMEM; + + CLEANUP_ARRAY(overlays, n_overlays, strv_free_many); + + /* First, prepare a mount for each image, but these won't be visible to the unit, instead + * they will be mounted in our propagate directory, and used as a source for the overlay. */ + for (size_t i = 0; i < n; i++) { + _cleanup_free_ char *mount_point = NULL; + const MountImage *m = mount_images + i; + + if (asprintf(&mount_point, "%s/%zu", extension_dir, i) < 0) + return -ENOMEM; + + for (size_t j = 0; hierarchies && hierarchies[j]; ++j) { + char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]); + if (!prefixed_hierarchy) + return -ENOMEM; + + r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy)); + if (r < 0) + return r; + } + + MountEntry *me = mount_list_extend(ml); + if (!me) + return -ENOMEM; + + *me = (MountEntry) { + .path_malloc = TAKE_PTR(mount_point), + .image_options_const = m->mount_options, + .ignore = m->ignore_enoent, + .source_const = m->source, + .mode = MOUNT_EXTENSION_IMAGE, + .has_prefix = true, + }; + } + + /* Secondly, extend the lowerdir= parameters with each ExtensionDirectory. + * Bind mount them in the same location as the ExtensionImages, so that we + * can check that they are valid trees (extension-release.d). */ + STRV_FOREACH(extension_directory, extension_directories) { + _cleanup_free_ char *mount_point = NULL, *source = NULL; + const char *e = *extension_directory; + bool ignore_enoent = false; + + /* Pick up the counter where the ExtensionImages left it. */ + if (asprintf(&mount_point, "%s/%zu", extension_dir, n++) < 0) + return -ENOMEM; + + /* Look for any prefixes */ + if (startswith(e, "-")) { + e++; + ignore_enoent = true; + } + /* Ignore this for now */ + if (startswith(e, "+")) + e++; + + source = strdup(e); + if (!source) + return -ENOMEM; + + for (size_t j = 0; hierarchies && hierarchies[j]; ++j) { + char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]); + if (!prefixed_hierarchy) + return -ENOMEM; + + r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy)); + if (r < 0) + return r; + } + + MountEntry *me = mount_list_extend(ml); + if (!me) + return -ENOMEM; + + *me = (MountEntry) { + .path_malloc = TAKE_PTR(mount_point), + .source_malloc = TAKE_PTR(source), + .mode = MOUNT_EXTENSION_DIRECTORY, + .ignore = ignore_enoent, + .has_prefix = true, + .read_only = true, + }; + } + + /* Then, for each hierarchy, prepare an overlay with the list of lowerdir= strings + * set up earlier. */ + for (size_t i = 0; hierarchies && hierarchies[i]; ++i) { + _cleanup_free_ char *prefixed_hierarchy = NULL; + + prefixed_hierarchy = path_join(root, hierarchies[i]); + if (!prefixed_hierarchy) + return -ENOMEM; + + MountEntry *me = mount_list_extend(ml); + if (!me) + return -ENOMEM; + + *me = (MountEntry) { + .path_malloc = TAKE_PTR(prefixed_hierarchy), + .overlay_layers = TAKE_PTR(overlays[i]), + .mode = MOUNT_OVERLAY, + .has_prefix = true, + .ignore = true, /* If the source image doesn't set the ignore bit it will fail earlier. */ + }; + } + + return 0; +} + +static int append_tmpfs_mounts(MountList *ml, const TemporaryFileSystem *tmpfs, size_t n) { + assert(ml); + assert(tmpfs || n == 0); + + FOREACH_ARRAY(t, tmpfs, n) { + _cleanup_free_ char *o = NULL, *str = NULL; + unsigned long flags; + bool ro = false; + int r; + + if (!path_is_absolute(t->path)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", t->path); + + str = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options); + if (!str) + return -ENOMEM; + + r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o); + if (r < 0) + return log_debug_errno(r, "Failed to parse mount option '%s': %m", str); + + ro = flags & MS_RDONLY; + if (ro) + flags ^= MS_RDONLY; + + MountEntry *me = mount_list_extend(ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = t->path, + .mode = MOUNT_TMPFS, + .read_only = ro, + .options_malloc = TAKE_PTR(o), + .flags = flags, + }; + } + + return 0; +} + +static int append_static_mounts(MountList *ml, const MountEntry *mounts, size_t n, bool ignore_protect) { + assert(ml); + assert(mounts || n == 0); + + /* Adds a list of static pre-defined entries */ + + FOREACH_ARRAY(m, mounts, n) { + MountEntry *me = mount_list_extend(ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = mount_entry_path(m), + .mode = m->mode, + .ignore = m->ignore || ignore_protect, + }; + } + + return 0; +} + +static int append_protect_home(MountList *ml, ProtectHome protect_home, bool ignore_protect) { + assert(ml); + + switch (protect_home) { + + case PROTECT_HOME_NO: + return 0; + + case PROTECT_HOME_READ_ONLY: + return append_static_mounts(ml, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect); + + case PROTECT_HOME_TMPFS: + return append_static_mounts(ml, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect); + + case PROTECT_HOME_YES: + return append_static_mounts(ml, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect); + + default: + assert_not_reached(); + } +} + +static int append_protect_system(MountList *ml, ProtectSystem protect_system, bool ignore_protect) { + assert(ml); + + switch (protect_system) { + + case PROTECT_SYSTEM_NO: + return 0; + + case PROTECT_SYSTEM_STRICT: + return append_static_mounts(ml, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect); + + case PROTECT_SYSTEM_YES: + return append_static_mounts(ml, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect); + + case PROTECT_SYSTEM_FULL: + return append_static_mounts(ml, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect); + + default: + assert_not_reached(); + } +} + +static int mount_path_compare(const MountEntry *a, const MountEntry *b) { + int d; + + /* ExtensionImages/Directories will be used by other mounts as a base, so sort them first + * regardless of the prefix - they are set up in the propagate directory anyway */ + d = -CMP(a->mode == MOUNT_EXTENSION_IMAGE, b->mode == MOUNT_EXTENSION_IMAGE); + if (d != 0) + return d; + d = -CMP(a->mode == MOUNT_EXTENSION_DIRECTORY, b->mode == MOUNT_EXTENSION_DIRECTORY); + if (d != 0) + return d; + + /* If the paths are not equal, then order prefixes first */ + d = path_compare(mount_entry_path(a), mount_entry_path(b)); + if (d != 0) + return d; + + /* If the paths are equal, check the mode */ + return CMP((int) a->mode, (int) b->mode); +} + +static int prefix_where_needed(MountList *ml, const char *root_directory) { + /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */ + + assert(ml); + + FOREACH_ARRAY(me, ml->mounts, ml->n_mounts) { + char *s; + + if (me->has_prefix) + continue; + + s = path_join(root_directory, mount_entry_path(me)); + if (!s) + return -ENOMEM; + + mount_entry_consume_prefix(me, s); + } + + return 0; +} + +static void drop_duplicates(MountList *ml) { + MountEntry *f, *t, *previous; + + assert(ml); + + /* Drops duplicate entries. Expects that the array is properly ordered already. */ + + for (f = ml->mounts, t = ml->mounts, previous = NULL; f < ml->mounts + ml->n_mounts; f++) { + + /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare() + * above. Note that we only drop duplicates that haven't been mounted yet. */ + if (previous && + path_equal(mount_entry_path(f), mount_entry_path(previous)) && + f->state == MOUNT_PENDING && previous->state == MOUNT_PENDING) { + log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode)); + /* Propagate the flags to the remaining entry */ + previous->read_only = previous->read_only || mount_entry_read_only(f); + previous->noexec = previous->noexec || mount_entry_noexec(f); + previous->exec = previous->exec || mount_entry_exec(f); + mount_entry_done(f); + continue; + } + + *t = *f; + previous = t; + t++; + } + + ml->n_mounts = t - ml->mounts; +} + +static void drop_inaccessible(MountList *ml) { + MountEntry *f, *t; + const char *clear = NULL; + + assert(ml); + + /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly + * ordered already. */ + + for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) { + + /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop + * it, as inaccessible paths really should drop the entire subtree. */ + if (clear && path_startswith(mount_entry_path(f), clear)) { + log_debug("%s is masked by %s.", mount_entry_path(f), clear); + mount_entry_done(f); + continue; + } + + clear = f->mode == MOUNT_INACCESSIBLE ? mount_entry_path(f) : NULL; + + *t = *f; + t++; + } + + ml->n_mounts = t - ml->mounts; +} + +static void drop_nop(MountList *ml) { + MountEntry *f, *t; + + assert(ml); + + /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the + * list is ordered by prefixes. */ + + for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) { + + /* Only suppress such subtrees for READ_ONLY, READ_WRITE and READ_WRITE_IMPLICIT entries */ + if (IN_SET(f->mode, MOUNT_READ_ONLY, MOUNT_READ_WRITE, MOUNT_READ_WRITE_IMPLICIT)) { + MountEntry *found = NULL; + + /* Now let's find the first parent of the entry we are looking at. */ + for (MountEntry *p = PTR_SUB1(t, ml->mounts); p; p = PTR_SUB1(p, ml->mounts)) + if (path_startswith(mount_entry_path(f), mount_entry_path(p))) { + found = p; + break; + } + + /* We found it, let's see if it's the same mode, if so, we can drop this entry */ + if (found && found->mode == f->mode) { + log_debug("%s (%s) is made redundant by %s (%s)", + mount_entry_path(f), mount_mode_to_string(f->mode), + mount_entry_path(found), mount_mode_to_string(found->mode)); + mount_entry_done(f); + continue; + } + } + + *t = *f; + t++; + } + + ml->n_mounts = t - ml->mounts; +} + +static void drop_outside_root(MountList *ml, const char *root_directory) { + MountEntry *f, *t; + + assert(ml); + + /* Nothing to do */ + if (!root_directory) + return; + + /* Drops all mounts that are outside of the root directory. */ + + for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) { + + /* ExtensionImages/Directories bases are opened in /run/systemd/unit-extensions on the host */ + if (!IN_SET(f->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) && !path_startswith(mount_entry_path(f), root_directory)) { + log_debug("%s is outside of root directory.", mount_entry_path(f)); + mount_entry_done(f); + continue; + } + + *t = *f; + t++; + } + + ml->n_mounts = t - ml->mounts; +} + +static int clone_device_node( + const char *d, + const char *temporary_mount, + bool *make_devnode) { + + _cleanup_free_ char *sl = NULL; + const char *dn, *bn, *t; + struct stat st; + int r; + + if (stat(d, &st) < 0) { + if (errno == ENOENT) { + log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d); + return -ENXIO; + } + + return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d); + } + + if (!S_ISBLK(st.st_mode) && + !S_ISCHR(st.st_mode)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Device node '%s' to clone is not a device node, ignoring.", + d); + + dn = strjoina(temporary_mount, d); + + /* First, try to create device node properly */ + if (*make_devnode) { + mac_selinux_create_file_prepare(d, st.st_mode); + r = mknod(dn, st.st_mode, st.st_rdev); + mac_selinux_create_file_clear(); + if (r >= 0) + goto add_symlink; + if (errno != EPERM) + return log_debug_errno(errno, "mknod failed for %s: %m", d); + + /* This didn't work, let's not try this again for the next iterations. */ + *make_devnode = false; + } + + /* We're about to fall back to bind-mounting the device node. So create a dummy bind-mount target. + * Do not prepare device-node SELinux label (see issue 13762) */ + r = mknod(dn, S_IFREG, 0); + if (r < 0 && errno != EEXIST) + return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d); + + /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard + * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or + * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */ + r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL); + if (r < 0) + return r; + +add_symlink: + bn = path_startswith(d, "/dev/"); + if (!bn) + return 0; + + /* Create symlinks like /dev/char/1:9 → ../urandom */ + if (asprintf(&sl, "%s/dev/%s/" DEVNUM_FORMAT_STR, + temporary_mount, + S_ISCHR(st.st_mode) ? "char" : "block", + DEVNUM_FORMAT_VAL(st.st_rdev)) < 0) + return log_oom_debug(); + + (void) mkdir_parents(sl, 0755); + + t = strjoina("../", bn); + if (symlink(t, sl) < 0) + log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl); + + return 0; +} + +static char *settle_runtime_dir(RuntimeScope scope) { + char *runtime_dir; + + if (scope != RUNTIME_SCOPE_USER) + return strdup("/run/"); + + if (asprintf(&runtime_dir, "/run/user/" UID_FMT, geteuid()) < 0) + return NULL; + + return runtime_dir; +} + +static int create_temporary_mount_point(RuntimeScope scope, char **ret) { + _cleanup_free_ char *runtime_dir = NULL, *temporary_mount = NULL; + + assert(ret); + + runtime_dir = settle_runtime_dir(scope); + if (!runtime_dir) + return log_oom_debug(); + + temporary_mount = path_join(runtime_dir, "systemd/namespace-XXXXXX"); + if (!temporary_mount) + return log_oom_debug(); + + if (!mkdtemp(temporary_mount)) + return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount); + + *ret = TAKE_PTR(temporary_mount); + return 0; +} + +static int mount_private_dev(MountEntry *m, RuntimeScope scope) { + static const char devnodes[] = + "/dev/null\0" + "/dev/zero\0" + "/dev/full\0" + "/dev/random\0" + "/dev/urandom\0" + "/dev/tty\0"; + + _cleanup_free_ char *temporary_mount = NULL; + const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL; + bool can_mknod = true; + int r; + + assert(m); + + r = create_temporary_mount_point(scope, &temporary_mount); + if (r < 0) + return r; + + dev = strjoina(temporary_mount, "/dev"); + (void) mkdir(dev, 0755); + r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV); + if (r < 0) + goto fail; + + r = label_fix_full(AT_FDCWD, dev, "/dev", 0); + if (r < 0) { + log_debug_errno(r, "Failed to fix label of '%s' as /dev: %m", dev); + goto fail; + } + + devpts = strjoina(temporary_mount, "/dev/pts"); + (void) mkdir(devpts, 0755); + r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL); + if (r < 0) + goto fail; + + /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx. + * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible. + * Thus, in that case make a clone. + * In nspawn and other containers it will be a symlink, in that case make it a symlink. */ + r = is_symlink("/dev/ptmx"); + if (r < 0) { + log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m"); + goto fail; + } else if (r > 0) { + devptmx = strjoina(temporary_mount, "/dev/ptmx"); + if (symlink("pts/ptmx", devptmx) < 0) { + r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx); + goto fail; + } + } else { + r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod); + if (r < 0) + goto fail; + } + + devshm = strjoina(temporary_mount, "/dev/shm"); + (void) mkdir(devshm, 0755); + r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL); + if (r < 0) + goto fail; + + devmqueue = strjoina(temporary_mount, "/dev/mqueue"); + (void) mkdir(devmqueue, 0755); + (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL); + + devhugepages = strjoina(temporary_mount, "/dev/hugepages"); + (void) mkdir(devhugepages, 0755); + (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL); + + devlog = strjoina(temporary_mount, "/dev/log"); + if (symlink("/run/systemd/journal/dev-log", devlog) < 0) + log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog); + + NULSTR_FOREACH(d, devnodes) { + r = clone_device_node(d, temporary_mount, &can_mknod); + /* ENXIO means the *source* is not a device file, skip creation in that case */ + if (r < 0 && r != -ENXIO) + goto fail; + } + + r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID); + if (r < 0) + log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount); + + /* Create the /dev directory if missing. It is more likely to be missing when the service is started + * with RootDirectory. This is consistent with mount units creating the mount points when missing. */ + (void) mkdir_p_label(mount_entry_path(m), 0755); + + /* Unmount everything in old /dev */ + r = umount_recursive(mount_entry_path(m), 0); + if (r < 0) + log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m)); + + r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL); + if (r < 0) + goto fail; + + (void) rmdir(dev); + (void) rmdir(temporary_mount); + + return 1; + +fail: + if (devpts) + (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW); + + if (devshm) + (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW); + + if (devhugepages) + (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW); + + if (devmqueue) + (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW); + + (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW); + (void) rmdir(dev); + (void) rmdir(temporary_mount); + + return r; +} + +static int mount_bind_dev(const MountEntry *m) { + int r; + + assert(m); + + /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the + * service's /dev. This is only used when RootDirectory= is set. */ + + (void) mkdir_p_label(mount_entry_path(m), 0755); + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0) + return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m"); + if (r > 0) /* make this a NOP if /dev is already a mount point */ + return 0; + + r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + + return 1; +} + +static int mount_bind_sysfs(const MountEntry *m) { + int r; + + assert(m); + + (void) mkdir_p_label(mount_entry_path(m), 0755); + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0) + return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m"); + if (r > 0) /* make this a NOP if /sys is already a mount point */ + return 0; + + /* Bind mount the host's version so that we get all child mounts of it, too. */ + r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + + return 1; +} + +static int mount_private_apivfs( + const char *fstype, + const char *entry_path, + const char *bind_source, + const char *opts, + RuntimeScope scope) { + + _cleanup_(rmdir_and_freep) char *temporary_mount = NULL; + int r; + + assert(fstype); + assert(entry_path); + assert(bind_source); + + (void) mkdir_p_label(entry_path, 0755); + + /* First, check if we have enough privileges to mount a new instance. Note, a new sysfs instance + * cannot be mounted on an already existing mount. Let's use a temporary place. */ + r = create_temporary_mount_point(scope, &temporary_mount); + if (r < 0) + return r; + + r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); + if (r == -EINVAL && opts) + /* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is + * not supported by the kernel, and thus the per-instance hidepid= neither, which means we + * really don't want to use it, since it would affect our host's /proc mount. Hence let's + * gracefully fallback to a classic, unrestricted version. */ + r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL); + if (ERRNO_IS_NEG_PRIVILEGE(r)) { + /* When we do not have enough privileges to mount a new instance, fall back to use an + * existing mount. */ + + r = path_is_mount_point(entry_path, /* root = */ NULL, /* flags = */ 0); + if (r < 0) + return log_debug_errno(r, "Unable to determine whether '%s' is already mounted: %m", entry_path); + if (r > 0) + return 0; /* Use the current mount as is. */ + + /* We lack permissions to mount a new instance, and it is not already mounted. But we can + * access the host's, so as a final fallback bind-mount it to the destination, as most likely + * we are inside a user manager in an unprivileged user namespace. */ + r = mount_nofollow_verbose(LOG_DEBUG, bind_source, entry_path, /* fstype = */ NULL, MS_BIND|MS_REC, /* opts = */ NULL); + if (r < 0) + return r; + + return 1; + + } else if (r < 0) + return r; + + /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */ + r = umount_recursive(entry_path, /* flags = */ 0); + if (r < 0) + log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", entry_path); + + /* Then, move the new mount instance. */ + r = mount_nofollow_verbose(LOG_DEBUG, temporary_mount, entry_path, /* fstype = */ NULL, MS_MOVE, /* opts = */ NULL); + if (r < 0) + return r; + + /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn + * where a bunch of files are overmounted, in particular the boot id. */ + (void) bind_mount_submounts(bind_source, entry_path); + return 1; +} + +static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p) { + assert(m); + assert(p); + return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope); +} + +static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) { + _cleanup_free_ char *opts = NULL; + + assert(m); + assert(p); + + if (p->protect_proc != PROTECT_PROC_DEFAULT || + p->proc_subset != PROC_SUBSET_ALL) { + + /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it + * pretended to be per-instance but actually was per-namespace), hence let's make use of it + * if requested. To make sure this logic succeeds only on kernels where hidepid= is + * per-instance, we'll exclusively use the textual value for hidepid=, since support was + * added in the same commit: if it's supported it is thus also per-instance. */ + + const char *hpv = p->protect_proc == PROTECT_PROC_DEFAULT ? + "off" : + protect_proc_to_string(p->protect_proc); + + /* hidepid= support was added in 5.8, so we can use fsconfig()/fsopen() (which were added in + * 5.2) to check if hidepid= is supported. This avoids a noisy dmesg log by the kernel when + * trying to use hidepid= on systems where it isn't supported. The same applies for subset=. + * fsopen()/fsconfig() was also backported on some distros which allows us to detect + * hidepid=/subset= support in even more scenarios. */ + + if (mount_option_supported("proc", "hidepid", hpv) != 0) { + opts = strjoin("hidepid=", hpv); + if (!opts) + return -ENOMEM; + } + + if (p->proc_subset == PROC_SUBSET_PID && + mount_option_supported("proc", "subset", "pid") != 0) + if (!strextend_with_separator(&opts, ",", "subset=pid")) + return -ENOMEM; + } + + /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in + * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by + * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything + * mounted on /proc/ first. */ + return mount_private_apivfs("proc", mount_entry_path(m), "/proc", opts, p->runtime_scope); +} + +static int mount_tmpfs(const MountEntry *m) { + const char *entry_path, *inner_path; + int r; + + assert(m); + + entry_path = mount_entry_path(m); + inner_path = mount_entry_unprefixed_path(m); + + /* First, get rid of everything that is below if there is anything. Then, overmount with our new + * tmpfs */ + + (void) mkdir_p_label(entry_path, 0755); + (void) umount_recursive(entry_path, 0); + + r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m)); + if (r < 0) + return r; + + r = label_fix_full(AT_FDCWD, entry_path, inner_path, 0); + if (r < 0) + return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path); + + return 1; +} + +static int mount_run(const MountEntry *m) { + int r; + + assert(m); + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0 && r != -ENOENT) + return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m"); + if (r > 0) /* make this a NOP if /run is already a mount point */ + return 0; + + return mount_tmpfs(m); +} + +static int mount_mqueuefs(const MountEntry *m) { + int r; + const char *entry_path; + + assert(m); + + entry_path = mount_entry_path(m); + + (void) mkdir_p_label(entry_path, 0755); + (void) umount_recursive(entry_path, 0); + + r = mount_nofollow_verbose(LOG_DEBUG, "mqueue", entry_path, "mqueue", m->flags, mount_entry_options(m)); + if (r < 0) + return r; + + return 1; +} + +static int mount_image( + const MountEntry *m, + const char *root_directory, + const ImagePolicy *image_policy) { + + _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL, + *host_os_release_sysext_level = NULL, *host_os_release_confext_level = NULL, + *extension_name = NULL; + int r; + + assert(m); + + r = path_extract_filename(mount_entry_source(m), &extension_name); + if (r < 0) + return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m)); + + if (m->mode == MOUNT_EXTENSION_IMAGE) { + r = parse_os_release( + empty_to_root(root_directory), + "ID", &host_os_release_id, + "VERSION_ID", &host_os_release_version_id, + image_class_info[IMAGE_SYSEXT].level_env, &host_os_release_sysext_level, + image_class_info[IMAGE_CONFEXT].level_env, &host_os_release_confext_level, + NULL); + if (r < 0) + return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory)); + if (isempty(host_os_release_id)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory)); + } + + r = verity_dissect_and_mount( + /* src_fd= */ -1, + mount_entry_source(m), + mount_entry_path(m), + m->image_options_const, + image_policy, + host_os_release_id, + host_os_release_version_id, + host_os_release_sysext_level, + host_os_release_confext_level, + /* required_sysext_scope= */ NULL, + /* ret_image= */ NULL); + if (r == -ENOENT && m->ignore) + return 0; + if (r == -ESTALE && host_os_release_id) + return log_error_errno(r, // FIXME: this should not be logged ad LOG_ERR, as it will result in duplicate logging. + "Failed to mount image %s, extension-release metadata does not match the lower layer's: ID=%s%s%s%s%s%s%s", + mount_entry_source(m), + host_os_release_id, + host_os_release_version_id ? " VERSION_ID=" : "", + strempty(host_os_release_version_id), + host_os_release_sysext_level ? image_class_info[IMAGE_SYSEXT].level_env_print : "", + strempty(host_os_release_sysext_level), + host_os_release_confext_level ? image_class_info[IMAGE_CONFEXT].level_env_print : "", + strempty(host_os_release_confext_level)); + if (r < 0) + return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m)); + + return 1; +} + +static int mount_overlay(const MountEntry *m) { + _cleanup_free_ char *options = NULL, *layers = NULL; + int r; + + assert(m); + + /* Extension hierarchies are optional (e.g.: confext might not have /opt) so check if they actually + * exist in an image before attempting to create an overlay with them, otherwise the mount will + * fail. We can't check before this, as the images will not be mounted until now. */ + + /* Note that lowerdir= parameters are in 'reverse' order, so the top-most directory in the overlay + * comes first in the list. */ + STRV_FOREACH_BACKWARDS(o, m->overlay_layers) { + _cleanup_free_ char *escaped = NULL; + + r = is_dir(*o, /* follow= */ false); + if (r <= 0) { + if (r != -ENOENT) + log_debug_errno(r, + "Failed to check whether overlay layer source path '%s' exists, ignoring: %m", + *o); + continue; + } + + escaped = shell_escape(*o, ",:"); + if (!escaped) + return log_oom_debug(); + + if (!strextend_with_separator(&layers, ":", escaped)) + return log_oom_debug(); + } + + if (!layers) { + log_debug("None of the overlays specified in '%s' exist at the source, skipping.", + mount_entry_options(m)); + return 0; /* Only the root is set? Then there's nothing to overlay */ + } + + options = strjoin("lowerdir=", layers, ":", mount_entry_path(m)); /* The root goes in last */ + if (!options) + return log_oom_debug(); + + (void) mkdir_p_label(mount_entry_path(m), 0755); + + r = mount_nofollow_verbose(LOG_DEBUG, "overlay", mount_entry_path(m), "overlay", MS_RDONLY, options); + if (r == -ENOENT && m->ignore) + return 0; + if (r < 0) + return r; + + return 1; +} + +static int follow_symlink( + const char *root_directory, + MountEntry *m) { + + _cleanup_free_ char *target = NULL; + int r; + + /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we + * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at + * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the + * end and already have a fully normalized name. */ + + r = chase(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m)); + if (r > 0) /* Reached the end, nothing more to resolve */ + return 1; + + if (m->n_followed >= CHASE_MAX) /* put a boundary on things */ + return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), + "Symlink loop on '%s'.", + mount_entry_path(m)); + + log_debug("Followed mount entry path symlink %s %s %s.", + mount_entry_path(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), target); + + mount_entry_consume_prefix(m, TAKE_PTR(target)); + + m->n_followed ++; + + return 0; +} + +static int apply_one_mount( + const char *root_directory, + MountEntry *m, + const NamespaceParameters *p) { + + _cleanup_free_ char *inaccessible = NULL; + bool rbind = true, make = false; + const char *what; + int r; + + /* Return 1 when the mount should be post-processed (remounted r/o, etc.), 0 otherwise. In most + * cases post-processing is the right thing, the typical exception is when the mount is gracefully + * skipped. */ + + assert(m); + assert(p); + + log_debug("Applying namespace mount on %s", mount_entry_path(m)); + + switch (m->mode) { + + case MOUNT_INACCESSIBLE: { + _cleanup_free_ char *runtime_dir = NULL; + struct stat target; + + /* First, get rid of everything that is below if there + * is anything... Then, overmount it with an + * inaccessible path. */ + (void) umount_recursive(mount_entry_path(m), 0); + + if (lstat(mount_entry_path(m), &target) < 0) { + if (errno == ENOENT && m->ignore) + return 0; + + return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", + mount_entry_path(m)); + } + + /* We don't pass the literal runtime scope through here but one based purely on our UID. This + * means that the root user's --user services will use the host's inaccessible inodes rather + * then root's private ones. This is preferable since it means device nodes that are + * overmounted to make them inaccessible will be overmounted with a device node, rather than + * an AF_UNIX socket inode. */ + runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER); + if (!runtime_dir) + return log_oom_debug(); + + r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible); + if (r < 0) + return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), + "File type not supported for inaccessible mounts. Note that symlinks are not allowed"); + what = inaccessible; + break; + } + + case MOUNT_READ_ONLY: + case MOUNT_READ_WRITE: + case MOUNT_READ_WRITE_IMPLICIT: + case MOUNT_EXEC: + case MOUNT_NOEXEC: + r = path_is_mount_point(mount_entry_path(m), root_directory, 0); + if (r == -ENOENT && m->ignore) + return 0; + if (r < 0) + return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", + mount_entry_path(m)); + if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY + * and MS_NOEXEC bits for the mount point if needed. */ + return 1; + /* This isn't a mount point yet, let's make it one. */ + what = mount_entry_path(m); + break; + + case MOUNT_EXTENSION_DIRECTORY: { + _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL, + *host_os_release_level = NULL, *extension_name = NULL; + _cleanup_strv_free_ char **extension_release = NULL; + ImageClass class = IMAGE_SYSEXT; + + r = path_extract_filename(mount_entry_source(m), &extension_name); + if (r < 0) + return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m)); + + r = load_extension_release_pairs(mount_entry_source(m), IMAGE_SYSEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release); + if (r == -ENOENT) { + r = load_extension_release_pairs(mount_entry_source(m), IMAGE_CONFEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release); + if (r >= 0) + class = IMAGE_CONFEXT; + } + if (r < 0) + return log_debug_errno(r, "Failed to acquire 'extension-release' data of extension tree %s: %m", mount_entry_source(m)); + + r = parse_os_release( + empty_to_root(root_directory), + "ID", &host_os_release_id, + "VERSION_ID", &host_os_release_version_id, + image_class_info[class].level_env, &host_os_release_level, + NULL); + if (r < 0) + return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory)); + if (isempty(host_os_release_id)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory)); + + r = load_extension_release_pairs(mount_entry_source(m), class, extension_name, /* relax_extension_release_check= */ false, &extension_release); + if (r == -ENOENT && m->ignore) + return 0; + if (r < 0) + return log_debug_errno(r, "Failed to parse directory %s extension-release metadata: %m", extension_name); + + r = extension_release_validate( + extension_name, + host_os_release_id, + host_os_release_version_id, + host_os_release_level, + /* host_extension_scope */ NULL, /* Leave empty, we need to accept both system and portable */ + extension_release, + class); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's", extension_name); + if (r < 0) + return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name); + + _fallthrough_; + } + + case MOUNT_BIND: + rbind = false; + + _fallthrough_; + case MOUNT_BIND_RECURSIVE: { + _cleanup_free_ char *chased = NULL; + + /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note + * that bind mount source paths are always relative to the host root, hence we pass NULL as + * root directory to chase() here. */ + + r = chase(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL); + if (r == -ENOENT && m->ignore) { + log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m)); + return 0; + } + if (r < 0) + return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m)); + + log_debug("Followed source symlinks %s %s %s.", + mount_entry_source(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), chased); + + free_and_replace(m->source_malloc, chased); + + what = mount_entry_source(m); + make = true; + break; + } + + case MOUNT_EMPTY_DIR: + case MOUNT_TMPFS: + return mount_tmpfs(m); + + case MOUNT_PRIVATE_TMP: + case MOUNT_PRIVATE_TMP_READ_ONLY: + what = mount_entry_source(m); + make = true; + break; + + case MOUNT_PRIVATE_DEV: + return mount_private_dev(m, p->runtime_scope); + + case MOUNT_BIND_DEV: + return mount_bind_dev(m); + + case MOUNT_PRIVATE_SYSFS: + return mount_private_sysfs(m, p); + + case MOUNT_BIND_SYSFS: + return mount_bind_sysfs(m); + + case MOUNT_PROCFS: + return mount_procfs(m, p); + + case MOUNT_RUN: + return mount_run(m); + + case MOUNT_MQUEUEFS: + return mount_mqueuefs(m); + + case MOUNT_IMAGE: + return mount_image(m, NULL, p->mount_image_policy); + + case MOUNT_EXTENSION_IMAGE: + return mount_image(m, root_directory, p->extension_image_policy); + + case MOUNT_OVERLAY: + return mount_overlay(m); + + default: + assert_not_reached(); + } + + assert(what); + + r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL); + if (r < 0) { + bool try_again = false; + + if (r == -ENOENT && make) { + int q; + + /* Hmm, either the source or the destination are missing. Let's see if we can create + the destination, then try again. */ + + (void) mkdir_parents(mount_entry_path(m), 0755); + + q = make_mount_point_inode_from_path(what, mount_entry_path(m), 0755); + if (q < 0) { + if (q != -EEXIST) // FIXME: this shouldn't be logged at LOG_WARNING, but be bubbled up, and logged there to avoid duplicate logging + log_warning_errno(q, "Failed to create destination mount point node '%s', ignoring: %m", + mount_entry_path(m)); + } else + try_again = true; + } + + if (try_again) + r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL); + if (r < 0) + return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m)); // FIXME: this should not be logged here, but be bubbled up, to avoid duplicate logging + } + + log_debug("Successfully mounted %s to %s", what, mount_entry_path(m)); + return 1; +} + +static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) { + unsigned long new_flags = 0, flags_mask = 0; + bool submounts; + int r; + + assert(m); + assert(proc_self_mountinfo); + + if (m->state != MOUNT_APPLIED) + return 0; + + if (mount_entry_read_only(m) || m->mode == MOUNT_PRIVATE_DEV) { + new_flags |= MS_RDONLY; + flags_mask |= MS_RDONLY; + } + + if (m->nosuid) { + new_flags |= MS_NOSUID; + flags_mask |= MS_NOSUID; + } + + if (flags_mask == 0) /* No Change? */ + return 0; + + /* We generally apply these changes recursively, except for /dev, and the cases we know there's + * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the + * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace + * and running Linux <= 4.17. */ + submounts = + mount_entry_read_only(m) && + !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS); + if (submounts) + r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo); + else + r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo); + + /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked + * read-only already stays this way. This improves compatibility with container managers, where we + * won't attempt to undo read-only mounts already applied. */ + + if (r == -ENOENT && m->ignore) + return 0; + if (r < 0) + return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m), + submounts ? " and its submounts" : ""); + return 0; +} + +static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) { + unsigned long new_flags = 0, flags_mask = 0; + bool submounts; + int r; + + assert(m); + assert(proc_self_mountinfo); + + if (m->state != MOUNT_APPLIED) + return 0; + + if (mount_entry_noexec(m)) { + new_flags |= MS_NOEXEC; + flags_mask |= MS_NOEXEC; + } else if (mount_entry_exec(m)) { + new_flags &= ~MS_NOEXEC; + flags_mask |= MS_NOEXEC; + } + + if (flags_mask == 0) /* No Change? */ + return 0; + + submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS); + + if (submounts) + r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo); + else + r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo); + + if (r == -ENOENT && m->ignore) + return 0; + if (r < 0) + return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m), + submounts ? " and its submounts" : ""); + return 0; +} + +static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) { + bool submounts; + int r; + + assert(m); + assert(proc_self_mountinfo); + + if (m->state != MOUNT_APPLIED) + return 0; + + submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS); + if (submounts) + r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo); + else + r = bind_remount_one_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, proc_self_mountinfo); + if (r == -ENOENT && m->ignore) + return 0; + if (r < 0) + return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m), + submounts ? " and its submounts" : ""); + return 0; +} + +static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) { + assert(p); + + /* + * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, + * since to protect the API VFS mounts, they need to be around in the + * first place... + */ + + return p->mount_apivfs || + p->protect_control_groups || + p->protect_kernel_tunables || + p->protect_proc != PROTECT_PROC_DEFAULT || + p->proc_subset != PROC_SUBSET_ALL; +} + +/* Walk all mount entries and dropping any unused mounts. This affects all + * mounts: + * - that are implicitly protected by a path that has been rendered inaccessible + * - whose immediate parent requests the same protection mode as the mount itself + * - that are outside of the relevant root directory + * - which are duplicates + */ +static void drop_unused_mounts(MountList *ml, const char *root_directory) { + assert(ml); + assert(root_directory); + + assert(ml->mounts || ml->n_mounts == 0); + + typesafe_qsort(ml->mounts, ml->n_mounts, mount_path_compare); + + drop_duplicates(ml); + drop_outside_root(ml, root_directory); + drop_inaccessible(ml); + drop_nop(ml); +} + +static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) { + int r; + + STRV_FOREACH_PAIR(src, dst, strv_symlinks) { + _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL; + + src_abs = path_join(root, *src); + dst_abs = path_join(root, *dst); + if (!src_abs || !dst_abs) + return -ENOMEM; + + r = mkdir_parents_label(dst_abs, 0755); + if (r < 0) + return log_debug_errno( + r, + "Failed to create parent directory for symlink '%s': %m", + dst_abs); + + r = symlink_idempotent(src_abs, dst_abs, true); + if (r < 0) + return log_debug_errno( + r, + "Failed to create symlink from '%s' to '%s': %m", + src_abs, + dst_abs); + } + + return 0; +} + +static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **error_path) { + assert(m); + + /* Create a string suitable for debugging logs, stripping for example the local working directory. + * For example, with a BindPaths=/var/bar that does not exist on the host: + * + * Before: + * foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory + * After: + * foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory + * + * Note that this is an error path, so no OOM check is done on purpose. */ + + if (!error_path) + return; + + if (!mount_entry_path(m)) { + *error_path = NULL; + return; + } + + if (root) { + const char *e = startswith(mount_entry_path(m), root); + if (e) { + *error_path = strdup(e); + return; + } + } + + *error_path = strdup(mount_entry_path(m)); + return; +} + +static int apply_mounts( + MountList *ml, + const char *root, + const NamespaceParameters *p, + char **error_path) { + + _cleanup_fclose_ FILE *proc_self_mountinfo = NULL; + _cleanup_free_ char **deny_list = NULL; + int r; + + assert(ml); + assert(root); + assert(p); + + if (ml->n_mounts == 0) /* Shortcut: nothing to do */ + return 0; + + /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of + * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */ + proc_self_mountinfo = fopen("/proc/self/mountinfo", "re"); + if (!proc_self_mountinfo) { + r = -errno; + + if (error_path) + *error_path = strdup("/proc/self/mountinfo"); + + return log_debug_errno(r, "Failed to open /proc/self/mountinfo: %m"); + } + + /* First round, establish all mounts we need */ + for (;;) { + bool again = false; + + FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) { + + if (m->state != MOUNT_PENDING) + continue; + + /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */ + r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) ? root : NULL, m); + if (r < 0) { + mount_entry_path_debug_string(root, m, error_path); + return r; + } + if (r == 0) { + /* We hit a symlinked mount point. The entry got rewritten and might + * point to a very different place now. Let's normalize the changed + * list, and start from the beginning. After all to mount the entry + * at the new location we might need some other mounts first */ + again = true; + break; + } + + /* Returns 1 if the mount should be post-processed, 0 otherwise */ + r = apply_one_mount(root, m, p); + if (r < 0) { + mount_entry_path_debug_string(root, m, error_path); + return r; + } + m->state = r == 0 ? MOUNT_SKIPPED : MOUNT_APPLIED; + } + + if (!again) + break; + + drop_unused_mounts(ml, root); + } + + /* Now that all filesystems have been set up, but before the + * read-only switches are flipped, create the exec dirs and other symlinks. + * Note that when /var/lib is not empty/tmpfs, these symlinks will already + * exist, which means this will be a no-op. */ + r = create_symlinks_from_tuples(root, p->symlinks); + if (r < 0) + return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m"); + + /* Create a deny list we can pass to bind_mount_recursive() */ + deny_list = new(char*, ml->n_mounts+1); + if (!deny_list) + return -ENOMEM; + for (size_t j = 0; j < ml->n_mounts; j++) + deny_list[j] = (char*) mount_entry_path(ml->mounts+j); + deny_list[ml->n_mounts] = NULL; + + /* Second round, flip the ro bits if necessary. */ + FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) { + r = make_read_only(m, deny_list, proc_self_mountinfo); + if (r < 0) { + mount_entry_path_debug_string(root, m, error_path); + return r; + } + } + + /* Third round, flip the noexec bits with a simplified deny list. */ + for (size_t j = 0; j < ml->n_mounts; j++) + if (IN_SET((ml->mounts+j)->mode, MOUNT_EXEC, MOUNT_NOEXEC)) + deny_list[j] = (char*) mount_entry_path(ml->mounts+j); + deny_list[ml->n_mounts] = NULL; + + FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) { + r = make_noexec(m, deny_list, proc_self_mountinfo); + if (r < 0) { + mount_entry_path_debug_string(root, m, error_path); + return r; + } + } + + /* Fourth round, flip the nosuid bits without a deny list. */ + if (p->mount_nosuid) + FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) { + r = make_nosuid(m, proc_self_mountinfo); + if (r < 0) { + mount_entry_path_debug_string(root, m, error_path); + return r; + } + } + + return 1; +} + +static bool root_read_only( + char **read_only_paths, + ProtectSystem protect_system) { + + /* Determine whether the root directory is going to be read-only given the configured settings. */ + + if (protect_system == PROTECT_SYSTEM_STRICT) + return true; + + if (prefixed_path_strv_contains(read_only_paths, "/")) + return true; + + return false; +} + +static bool home_read_only( + char** read_only_paths, + char** inaccessible_paths, + char** empty_directories, + const BindMount *bind_mounts, + size_t n_bind_mounts, + const TemporaryFileSystem *temporary_filesystems, + size_t n_temporary_filesystems, + ProtectHome protect_home) { + + /* Determine whether the /home directory is going to be read-only given the configured settings. Yes, + * this is a bit sloppy, since we don't bother checking for cases where / is affected by multiple + * settings. */ + + if (protect_home != PROTECT_HOME_NO) + return true; + + if (prefixed_path_strv_contains(read_only_paths, "/home") || + prefixed_path_strv_contains(inaccessible_paths, "/home") || + prefixed_path_strv_contains(empty_directories, "/home")) + return true; + + for (size_t i = 0; i < n_temporary_filesystems; i++) + if (path_equal(temporary_filesystems[i].path, "/home")) + return true; + + /* If /home is overmounted with some dir from the host it's not writable. */ + for (size_t i = 0; i < n_bind_mounts; i++) + if (path_equal(bind_mounts[i].destination, "/home")) + return true; + + return false; +} + +int setup_namespace(const NamespaceParameters *p, char **error_path) { + + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_strv_free_ char **hierarchies = NULL; + _cleanup_(mount_list_done) MountList ml = {}; + bool require_prefix = false; + const char *root; + DissectImageFlags dissect_image_flags = + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_DISCARD_ON_LOOP | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_FSCK | + DISSECT_IMAGE_USR_NO_ROOT | + DISSECT_IMAGE_GROWFS | + DISSECT_IMAGE_ADD_PARTITION_DEVICES | + DISSECT_IMAGE_PIN_PARTITION_DEVICES; + int r; + + assert(p); + + /* Make sure that all mknod(), mkdir() calls we do are unaffected by the umask, and the access modes + * we configure take effect */ + BLOCK_WITH_UMASK(0000); + + bool setup_propagate = !isempty(p->propagate_dir) && !isempty(p->incoming_dir); + unsigned long mount_propagation_flag = p->mount_propagation_flag != 0 ? p->mount_propagation_flag : MS_SHARED; + + if (p->root_image) { + /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */ + if (root_read_only(p->read_only_paths, + p->protect_system) && + home_read_only(p->read_only_paths, p->inaccessible_paths, p->empty_directories, + p->bind_mounts, p->n_bind_mounts, p->temporary_filesystems, p->n_temporary_filesystems, + p->protect_home) && + strv_isempty(p->read_write_paths)) + dissect_image_flags |= DISSECT_IMAGE_READ_ONLY; + + SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path); + + r = loop_device_make_by_path( + p->root_image, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, + /* sector_size= */ UINT32_MAX, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, + &loop_device); + if (r < 0) + return log_debug_errno(r, "Failed to create loop device for root image: %m"); + + r = dissect_loop_device( + loop_device, + p->verity, + p->root_image_options, + p->root_image_policy, + dissect_image_flags, + &dissected_image); + if (r < 0) + return log_debug_errno(r, "Failed to dissect image: %m"); + + r = dissected_image_load_verity_sig_partition( + dissected_image, + loop_device->fd, + p->verity); + if (r < 0) + return r; + + r = dissected_image_decrypt( + dissected_image, + NULL, + p->verity, + dissect_image_flags); + if (r < 0) + return log_debug_errno(r, "Failed to decrypt dissected image: %m"); + } + + if (p->root_directory) + root = p->root_directory; + else { + /* /run/systemd should have been created by PID 1 early on already, but in some cases, like + * when running tests (test-execute), it might not have been created yet so let's make sure + * we create it if it doesn't already exist. */ + (void) mkdir_p_label("/run/systemd", 0755); + + /* Always create the mount namespace in a temporary directory, instead of operating directly + * in the root. The temporary directory prevents any mounts from being potentially obscured + * my other mounts we already applied. We use the same mount point for all images, which is + * safe, since they all live in their own namespaces after all, and hence won't see each + * other. (Note: this directory is also created by PID 1 early on, we create it here for + * similar reasons as /run/systemd/ first.) */ + root = "/run/systemd/mount-rootfs"; + (void) mkdir_label(root, 0555); + + require_prefix = true; + } + + if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) { + /* Hierarchy population needs to be done for sysext and confext extension images */ + r = parse_env_extension_hierarchies(&hierarchies, "SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES"); + if (r < 0) + return r; + } + + r = append_access_mounts(&ml, p->read_write_paths, MOUNT_READ_WRITE, require_prefix); + if (r < 0) + return r; + + r = append_access_mounts(&ml, p->read_only_paths, MOUNT_READ_ONLY, require_prefix); + if (r < 0) + return r; + + r = append_access_mounts(&ml, p->inaccessible_paths, MOUNT_INACCESSIBLE, require_prefix); + if (r < 0) + return r; + + r = append_access_mounts(&ml, p->exec_paths, MOUNT_EXEC, require_prefix); + if (r < 0) + return r; + + r = append_access_mounts(&ml, p->no_exec_paths, MOUNT_NOEXEC, require_prefix); + if (r < 0) + return r; + + r = append_empty_dir_mounts(&ml, p->empty_directories); + if (r < 0) + return r; + + r = append_bind_mounts(&ml, p->bind_mounts, p->n_bind_mounts); + if (r < 0) + return r; + + r = append_tmpfs_mounts(&ml, p->temporary_filesystems, p->n_temporary_filesystems); + if (r < 0) + return r; + + if (p->tmp_dir) { + bool ro = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY); + + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/tmp", + .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP, + .source_const = p->tmp_dir, + }; + } + + if (p->var_tmp_dir) { + bool ro = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY); + + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/var/tmp", + .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP, + .source_const = p->var_tmp_dir, + }; + } + + r = append_mount_images(&ml, p->mount_images, p->n_mount_images); + if (r < 0) + return r; + + r = append_extensions(&ml, root, p->extension_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories); + if (r < 0) + return r; + + if (p->private_dev) { + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/dev", + .mode = MOUNT_PRIVATE_DEV, + .flags = DEV_MOUNT_OPTIONS, + }; + } + + /* In case /proc is successfully mounted with pid tree subset only (ProcSubset=pid), the protective + mounts to non-pid /proc paths would fail. But the pid only option may have failed gracefully, so + let's try the mounts but it's not fatal if they don't succeed. */ + bool ignore_protect_proc = p->ignore_protect_paths || p->proc_subset == PROC_SUBSET_PID; + if (p->protect_kernel_tunables) { + r = append_static_mounts(&ml, + protect_kernel_tunables_proc_table, + ELEMENTSOF(protect_kernel_tunables_proc_table), + ignore_protect_proc); + if (r < 0) + return r; + + r = append_static_mounts(&ml, + protect_kernel_tunables_sys_table, + ELEMENTSOF(protect_kernel_tunables_sys_table), + p->ignore_protect_paths); + if (r < 0) + return r; + } + + if (p->protect_kernel_modules) { + r = append_static_mounts(&ml, + protect_kernel_modules_table, + ELEMENTSOF(protect_kernel_modules_table), + p->ignore_protect_paths); + if (r < 0) + return r; + } + + if (p->protect_kernel_logs) { + r = append_static_mounts(&ml, + protect_kernel_logs_proc_table, + ELEMENTSOF(protect_kernel_logs_proc_table), + ignore_protect_proc); + if (r < 0) + return r; + + r = append_static_mounts(&ml, + protect_kernel_logs_dev_table, + ELEMENTSOF(protect_kernel_logs_dev_table), + p->ignore_protect_paths); + if (r < 0) + return r; + } + + if (p->protect_control_groups) { + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/sys/fs/cgroup", + .mode = MOUNT_READ_ONLY, + }; + } + + r = append_protect_home(&ml, p->protect_home, p->ignore_protect_paths); + if (r < 0) + return r; + + r = append_protect_system(&ml, p->protect_system, false); + if (r < 0) + return r; + + if (namespace_parameters_mount_apivfs(p)) { + r = append_static_mounts(&ml, + apivfs_table, + ELEMENTSOF(apivfs_table), + p->ignore_protect_paths); + if (r < 0) + return r; + } + + /* Note, if proc is mounted with subset=pid then neither of the two paths will exist, i.e. they are + * implicitly protected by the mount option. */ + if (p->protect_hostname) { + r = append_static_mounts( + &ml, + protect_hostname_table, + ELEMENTSOF(protect_hostname_table), + ignore_protect_proc); + if (r < 0) + return r; + } + + if (p->private_network) { + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/sys", + .mode = MOUNT_PRIVATE_SYSFS, + }; + } + + if (p->private_ipc) { + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/dev/mqueue", + .mode = MOUNT_MQUEUEFS, + .flags = MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, + }; + } + + if (p->creds_path) { + /* If our service has a credentials store configured, then bind that one in, but hide + * everything else. */ + + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/run/credentials", + .mode = MOUNT_TMPFS, + .read_only = true, + .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, + .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC, + }; + + me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = p->creds_path, + .mode = MOUNT_BIND, + .read_only = true, + .source_const = p->creds_path, + .ignore = true, + }; + } else { + /* If our service has no credentials store configured, then make the whole credentials tree + * inaccessible wholesale. */ + + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/run/credentials", + .mode = MOUNT_INACCESSIBLE, + .ignore = true, + }; + } + + if (p->log_namespace) { + _cleanup_free_ char *q = NULL; + + q = strjoin("/run/systemd/journal.", p->log_namespace); + if (!q) + return log_oom_debug(); + + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/run/systemd/journal", + .mode = MOUNT_BIND_RECURSIVE, + .read_only = true, + .source_malloc = TAKE_PTR(q), + }; + } + + /* Will be used to add bind mounts at runtime */ + if (setup_propagate) { + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .source_const = p->propagate_dir, + .path_const = p->incoming_dir, + .mode = MOUNT_BIND, + .read_only = true, + }; + } + + if (p->notify_socket) { + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = p->notify_socket, + .source_const = p->notify_socket, + .mode = MOUNT_BIND, + .read_only = true, + }; + } + + if (p->host_os_release_stage) { + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/run/host/.os-release-stage/", + .source_const = p->host_os_release_stage, + .mode = MOUNT_BIND, + .read_only = true, + .ignore = true, /* Live copy, don't hard-fail if it goes missing */ + }; + } + + /* Prepend the root directory where that's necessary */ + r = prefix_where_needed(&ml, root); + if (r < 0) + return r; + + drop_unused_mounts(&ml, root); + + /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */ + + if (unshare(CLONE_NEWNS) < 0) { + r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m"); + + if (ERRNO_IS_PRIVILEGE(r) || + ERRNO_IS_NOT_SUPPORTED(r)) + /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter + * in place that doesn't allow us to create namespaces (or a missing cap), then + * propagate a recognizable error back, which the caller can use to detect this case + * (and only this) and optionally continue without namespacing applied. */ + return -ENOANO; + + return r; + } + + /* Create the source directory to allow runtime propagation of mounts */ + if (setup_propagate) + (void) mkdir_p(p->propagate_dir, 0600); + + if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) + /* ExtensionImages/Directories mountpoint directories will be created while parsing the + * mounts to create, so have the parent ready */ + (void) mkdir_p(p->extension_dir, 0600); + + /* Remount / as SLAVE so that nothing now mounted in the namespace + * shows up in the parent */ + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m"); + + if (p->root_image) { + /* A root image is specified, mount it to the right place */ + r = dissected_image_mount( + dissected_image, + root, + /* uid_shift= */ UID_INVALID, + /* uid_range= */ UID_INVALID, + /* userns_fd= */ -EBADF, + dissect_image_flags); + if (r < 0) + return log_debug_errno(r, "Failed to mount root image: %m"); + + /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device + * if it likes. */ + r = loop_device_flock(loop_device, LOCK_UN); + if (r < 0) + return log_debug_errno(r, "Failed to release lock on loopback block device: %m"); + + r = dissected_image_relinquish(dissected_image); + if (r < 0) + return log_debug_errno(r, "Failed to relinquish dissected image: %m"); + + } else if (p->root_directory) { + + /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */ + r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root); + if (r == 0) { + r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + } + + } else { + /* Let's mount the main root directory to the root directory to use */ + r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + } + + /* Try to set up the new root directory before mounting anything else there. */ + if (p->root_image || p->root_directory) + (void) base_filesystem_create(root, UID_INVALID, GID_INVALID); + + /* Now make the magic happen */ + r = apply_mounts(&ml, root, p, error_path); + if (r < 0) + return r; + + /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */ + r = mount_switch_root(root, /* mount_propagation_flag = */ 0); + if (r == -EINVAL && p->root_directory) { + /* If we are using root_directory and we don't have privileges (ie: user manager in a user + * namespace) and the root_directory is already a mount point in the parent namespace, + * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than + * EPERM). Attempt to bind-mount it over itself (like we do above if it's not already a + * mount point) and try again. */ + r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + r = mount_switch_root(root, /* mount_propagation_flag = */ 0); + } + if (r < 0) + return log_debug_errno(r, "Failed to mount root with MS_MOVE: %m"); + + /* Remount / as the desired mode. Note that this will not reestablish propagation from our side to + * the host, since what's disconnected is disconnected. */ + if (mount(NULL, "/", NULL, mount_propagation_flag | MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m"); + + /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only supported for + * non-shared mounts. This needs to happen after remounting / or it will fail. */ + if (setup_propagate && mount(NULL, p->incoming_dir, NULL, MS_SLAVE, NULL) < 0) + return log_debug_errno(errno, "Failed to remount %s with MS_SLAVE: %m", p->incoming_dir); + + return 0; +} + +void bind_mount_free_many(BindMount *b, size_t n) { + assert(b || n == 0); + + for (size_t i = 0; i < n; i++) { + free(b[i].source); + free(b[i].destination); + } + + free(b); +} + +int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) { + _cleanup_free_ char *s = NULL, *d = NULL; + BindMount *c; + + assert(b); + assert(n); + assert(item); + + s = strdup(item->source); + if (!s) + return -ENOMEM; + + d = strdup(item->destination); + if (!d) + return -ENOMEM; + + c = reallocarray(*b, *n + 1, sizeof(BindMount)); + if (!c) + return -ENOMEM; + + *b = c; + + c[(*n) ++] = (BindMount) { + .source = TAKE_PTR(s), + .destination = TAKE_PTR(d), + .read_only = item->read_only, + .nosuid = item->nosuid, + .recursive = item->recursive, + .ignore_enoent = item->ignore_enoent, + }; + + return 0; +} + +MountImage* mount_image_free_many(MountImage *m, size_t *n) { + assert(n); + assert(m || *n == 0); + + for (size_t i = 0; i < *n; i++) { + free(m[i].source); + free(m[i].destination); + mount_options_free_all(m[i].mount_options); + } + + free(m); + *n = 0; + return NULL; +} + +int mount_image_add(MountImage **m, size_t *n, const MountImage *item) { + _cleanup_free_ char *s = NULL, *d = NULL; + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + MountImage *c; + + assert(m); + assert(n); + assert(item); + + s = strdup(item->source); + if (!s) + return -ENOMEM; + + if (item->destination) { + d = strdup(item->destination); + if (!d) + return -ENOMEM; + } + + LIST_FOREACH(mount_options, i, item->mount_options) { + _cleanup_(mount_options_free_allp) MountOptions *o = NULL; + + o = new(MountOptions, 1); + if (!o) + return -ENOMEM; + + *o = (MountOptions) { + .partition_designator = i->partition_designator, + .options = strdup(i->options), + }; + if (!o->options) + return -ENOMEM; + + LIST_APPEND(mount_options, options, TAKE_PTR(o)); + } + + c = reallocarray(*m, *n + 1, sizeof(MountImage)); + if (!c) + return -ENOMEM; + + *m = c; + + c[(*n) ++] = (MountImage) { + .source = TAKE_PTR(s), + .destination = TAKE_PTR(d), + .mount_options = TAKE_PTR(options), + .ignore_enoent = item->ignore_enoent, + .type = item->type, + }; + + return 0; +} + +void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) { + assert(t || n == 0); + + for (size_t i = 0; i < n; i++) { + free(t[i].path); + free(t[i].options); + } + + free(t); +} + +int temporary_filesystem_add( + TemporaryFileSystem **t, + size_t *n, + const char *path, + const char *options) { + + _cleanup_free_ char *p = NULL, *o = NULL; + TemporaryFileSystem *c; + + assert(t); + assert(n); + assert(path); + + p = strdup(path); + if (!p) + return -ENOMEM; + + if (!isempty(options)) { + o = strdup(options); + if (!o) + return -ENOMEM; + } + + c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem)); + if (!c) + return -ENOMEM; + + *t = c; + + c[(*n) ++] = (TemporaryFileSystem) { + .path = TAKE_PTR(p), + .options = TAKE_PTR(o), + }; + + return 0; +} + +static int make_tmp_prefix(const char *prefix) { + _cleanup_free_ char *t = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + /* Don't do anything unless we know the dir is actually missing */ + r = access(prefix, F_OK); + if (r >= 0) + return 0; + if (errno != ENOENT) + return -errno; + + WITH_UMASK(000) + r = mkdir_parents(prefix, 0755); + if (r < 0) + return r; + + r = tempfn_random(prefix, NULL, &t); + if (r < 0) + return r; + + /* umask will corrupt this access mode, but that doesn't matter, we need to call chmod() anyway for + * the suid bit, below. */ + fd = open_mkdir_at(AT_FDCWD, t, O_EXCL|O_CLOEXEC, 0777); + if (fd < 0) + return fd; + + r = RET_NERRNO(fchmod(fd, 01777)); + if (r < 0) { + (void) rmdir(t); + return r; + } + + r = RET_NERRNO(rename(t, prefix)); + if (r < 0) { + (void) rmdir(t); + return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */ + } + + return 0; + +} + +static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) { + _cleanup_free_ char *x = NULL; + _cleanup_free_ char *y = NULL; + sd_id128_t boot_id; + bool rw = true; + int r; + + assert(id); + assert(prefix); + assert(path); + + /* We include the boot id in the directory so that after a + * reboot we can easily identify obsolete directories. */ + + r = sd_id128_get_boot(&boot_id); + if (r < 0) + return r; + + x = strjoin(prefix, "/systemd-private-", SD_ID128_TO_STRING(boot_id), "-", id, "-XXXXXX"); + if (!x) + return -ENOMEM; + + r = make_tmp_prefix(prefix); + if (r < 0) + return r; + + WITH_UMASK(0077) + if (!mkdtemp(x)) { + if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno)) + rw = false; + else + return -errno; + } + + if (rw) { + y = strjoin(x, "/tmp"); + if (!y) + return -ENOMEM; + + WITH_UMASK(0000) + if (mkdir(y, 0777 | S_ISVTX) < 0) + return -errno; + + r = label_fix_full(AT_FDCWD, y, prefix, 0); + if (r < 0) + return r; + + if (tmp_path) + *tmp_path = TAKE_PTR(y); + } else { + /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being + * read-only. This way the service will get the EROFS result as if it was writing to the real + * file system. */ + WITH_UMASK(0000) + r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500); + if (r < 0) + return r; + + r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY); + if (r < 0) + return r; + } + + *path = TAKE_PTR(x); + return 0; +} + +int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) { + _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL; + _cleanup_(rmdir_and_freep) char *a_tmp = NULL; + char *b; + int r; + + assert(id); + assert(tmp_dir); + assert(var_tmp_dir); + + r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp); + if (r < 0) + return r; + + r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL); + if (r < 0) + return r; + + a_tmp = mfree(a_tmp); /* avoid rmdir */ + *tmp_dir = TAKE_PTR(a); + *var_tmp_dir = TAKE_PTR(b); + + return 0; +} + +int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag) { + _cleanup_close_ int ns = -EBADF; + const char *ns_name, *ns_path; + int r; + + assert(ns_storage_socket); + assert(ns_storage_socket[0] >= 0); + assert(ns_storage_socket[1] >= 0); + + ns_name = ASSERT_PTR(namespace_single_flag_to_string(nsflag)); + + /* We use the passed socketpair as a storage buffer for our namespace reference fd. Whatever process + * runs this first shall create a new namespace, all others should just join it. To serialize that we + * use a file lock on the socket pair. + * + * It's a bit crazy, but hey, works great! */ + + r = posix_lock(ns_storage_socket[0], LOCK_EX); + if (r < 0) + return r; + + CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]); + + ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT); + if (ns >= 0) { + /* Yay, found something, so let's join the namespace */ + r = RET_NERRNO(setns(ns, nsflag)); + if (r < 0) + return r; + + return 0; + } + + if (ns != -EAGAIN) + return ns; + + /* Nothing stored yet, so let's create a new namespace. */ + + if (unshare(nsflag) < 0) + return -errno; + + if (nsflag == CLONE_NEWNET) + (void) loopback_setup(); + + ns_path = strjoina("/proc/self/ns/", ns_name); + ns = open(ns_path, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (ns < 0) + return -errno; + + r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT); + if (r < 0) + return r; + + return 1; +} + +int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) { + _cleanup_close_ int ns = -EBADF; + int r; + + assert(ns_storage_socket); + assert(ns_storage_socket[0] >= 0); + assert(ns_storage_socket[1] >= 0); + assert(path); + + /* If the storage socket doesn't contain a ns fd yet, open one via the file system and store it in + * it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will + * allocate a new anonymous ns if needed. */ + + r = posix_lock(ns_storage_socket[0], LOCK_EX); + if (r < 0) + return r; + + CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]); + + ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT); + if (ns >= 0) + return 0; + if (ns != -EAGAIN) + return ns; + + /* Nothing stored yet. Open the file from the file system. */ + + ns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (ns < 0) + return -errno; + + r = fd_is_ns(ns, nsflag); + if (r == 0) + return -EINVAL; + if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */ + return r; + + r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT); + if (r < 0) + return r; + + return 1; +} + +bool ns_type_supported(NamespaceType type) { + const char *t, *ns_proc; + + t = namespace_type_to_string(type); + if (!t) /* Don't know how to translate this? Then it's not supported */ + return false; + + ns_proc = strjoina("/proc/self/ns/", t); + return access(ns_proc, F_OK) == 0; +} + +static const char *const protect_home_table[_PROTECT_HOME_MAX] = { + [PROTECT_HOME_NO] = "no", + [PROTECT_HOME_YES] = "yes", + [PROTECT_HOME_READ_ONLY] = "read-only", + [PROTECT_HOME_TMPFS] = "tmpfs", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES); + +static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = { + [PROTECT_SYSTEM_NO] = "no", + [PROTECT_SYSTEM_YES] = "yes", + [PROTECT_SYSTEM_FULL] = "full", + [PROTECT_SYSTEM_STRICT] = "strict", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES); + +static const char* const namespace_type_table[] = { + [NAMESPACE_MOUNT] = "mnt", + [NAMESPACE_CGROUP] = "cgroup", + [NAMESPACE_UTS] = "uts", + [NAMESPACE_IPC] = "ipc", + [NAMESPACE_USER] = "user", + [NAMESPACE_PID] = "pid", + [NAMESPACE_NET] = "net", + [NAMESPACE_TIME] = "time", +}; + +DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType); + +static const char* const protect_proc_table[_PROTECT_PROC_MAX] = { + [PROTECT_PROC_DEFAULT] = "default", + [PROTECT_PROC_NOACCESS] = "noaccess", + [PROTECT_PROC_INVISIBLE] = "invisible", + [PROTECT_PROC_PTRACEABLE] = "ptraceable", +}; + +DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc); + +static const char* const proc_subset_table[_PROC_SUBSET_MAX] = { + [PROC_SUBSET_ALL] = "all", + [PROC_SUBSET_PID] = "pid", +}; + +DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset); diff --git a/src/core/namespace.h b/src/core/namespace.h new file mode 100644 index 0000000..921716b --- /dev/null +++ b/src/core/namespace.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2016 Djalal Harouni +***/ + +typedef struct NamespaceParameters NamespaceParameters; +typedef struct BindMount BindMount; +typedef struct TemporaryFileSystem TemporaryFileSystem; +typedef struct MountImage MountImage; + +#include + +#include "dissect-image.h" +#include "fs-util.h" +#include "macro.h" +#include "namespace-util.h" +#include "runtime-scope.h" +#include "string-util.h" + +typedef enum ProtectHome { + PROTECT_HOME_NO, + PROTECT_HOME_YES, + PROTECT_HOME_READ_ONLY, + PROTECT_HOME_TMPFS, + _PROTECT_HOME_MAX, + _PROTECT_HOME_INVALID = -EINVAL, +} ProtectHome; + +typedef enum ProtectSystem { + PROTECT_SYSTEM_NO, + PROTECT_SYSTEM_YES, + PROTECT_SYSTEM_FULL, + PROTECT_SYSTEM_STRICT, + _PROTECT_SYSTEM_MAX, + _PROTECT_SYSTEM_INVALID = -EINVAL, +} ProtectSystem; + +typedef enum ProtectProc { + PROTECT_PROC_DEFAULT, + PROTECT_PROC_NOACCESS, /* hidepid=noaccess */ + PROTECT_PROC_INVISIBLE, /* hidepid=invisible */ + PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */ + _PROTECT_PROC_MAX, + _PROTECT_PROC_INVALID = -EINVAL, +} ProtectProc; + +typedef enum ProcSubset { + PROC_SUBSET_ALL, + PROC_SUBSET_PID, /* subset=pid */ + _PROC_SUBSET_MAX, + _PROC_SUBSET_INVALID = -EINVAL, +} ProcSubset; + +struct BindMount { + char *source; + char *destination; + bool read_only; + bool nosuid; + bool recursive; + bool ignore_enoent; +}; + +struct TemporaryFileSystem { + char *path; + char *options; +}; + +typedef enum MountImageType { + MOUNT_IMAGE_DISCRETE, + MOUNT_IMAGE_EXTENSION, + _MOUNT_IMAGE_TYPE_MAX, + _MOUNT_IMAGE_TYPE_INVALID = -EINVAL, +} MountImageType; + +struct MountImage { + char *source; + char *destination; /* Unused if MountImageType == MOUNT_IMAGE_EXTENSION */ + LIST_HEAD(MountOptions, mount_options); + bool ignore_enoent; + MountImageType type; +}; + +struct NamespaceParameters { + RuntimeScope runtime_scope; + + const char *root_directory; + const char *root_image; + const MountOptions *root_image_options; + const ImagePolicy *root_image_policy; + + char **read_write_paths; + char **read_only_paths; + char **inaccessible_paths; + + char **exec_paths; + char **no_exec_paths; + + char **empty_directories; + char **symlinks; + + const BindMount *bind_mounts; + size_t n_bind_mounts; + + const TemporaryFileSystem *temporary_filesystems; + size_t n_temporary_filesystems; + + const MountImage *mount_images; + size_t n_mount_images; + const ImagePolicy *mount_image_policy; + + const char *tmp_dir; + const char *var_tmp_dir; + + const char *creds_path; + const char *log_namespace; + + unsigned long mount_propagation_flag; + VeritySettings *verity; + + const MountImage *extension_images; + size_t n_extension_images; + const ImagePolicy *extension_image_policy; + char **extension_directories; + + const char *propagate_dir; + const char *incoming_dir; + + const char *extension_dir; + const char *notify_socket; + const char *host_os_release_stage; + + bool ignore_protect_paths; + + bool protect_control_groups; + bool protect_kernel_tunables; + bool protect_kernel_modules; + bool protect_kernel_logs; + bool protect_hostname; + + bool private_dev; + bool private_network; + bool private_ipc; + + bool mount_apivfs; + bool mount_nosuid; + + ProtectHome protect_home; + ProtectSystem protect_system; + ProtectProc protect_proc; + ProcSubset proc_subset; +}; + +int setup_namespace(const NamespaceParameters *p, char **error_path); + +#define RUN_SYSTEMD_EMPTY "/run/systemd/empty" + +static inline char* namespace_cleanup_tmpdir(char *p) { + PROTECT_ERRNO; + if (!streq_ptr(p, RUN_SYSTEMD_EMPTY)) + (void) rmdir(p); + return mfree(p); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(char*, namespace_cleanup_tmpdir); + +int setup_tmp_dirs( + const char *id, + char **tmp_dir, + char **var_tmp_dir); + +int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag); +int open_shareable_ns_path(int netns_storage_socket[static 2], const char *path, unsigned long nsflag); + +const char* protect_home_to_string(ProtectHome p) _const_; +ProtectHome protect_home_from_string(const char *s) _pure_; + +const char* protect_system_to_string(ProtectSystem p) _const_; +ProtectSystem protect_system_from_string(const char *s) _pure_; + +const char* protect_proc_to_string(ProtectProc i) _const_; +ProtectProc protect_proc_from_string(const char *s) _pure_; + +const char* proc_subset_to_string(ProcSubset i) _const_; +ProcSubset proc_subset_from_string(const char *s) _pure_; + +void bind_mount_free_many(BindMount *b, size_t n); +int bind_mount_add(BindMount **b, size_t *n, const BindMount *item); + +void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n); +int temporary_filesystem_add(TemporaryFileSystem **t, size_t *n, + const char *path, const char *options); + +MountImage* mount_image_free_many(MountImage *m, size_t *n); +int mount_image_add(MountImage **m, size_t *n, const MountImage *item); + +const char* namespace_type_to_string(NamespaceType t) _const_; +NamespaceType namespace_type_from_string(const char *s) _pure_; + +bool ns_type_supported(NamespaceType type); diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf new file mode 100644 index 0000000..52034e0 --- /dev/null +++ b/src/core/org.freedesktop.systemd1.conf @@ -0,0 +1,452 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/core/org.freedesktop.systemd1.policy.in b/src/core/org.freedesktop.systemd1.policy.in new file mode 100644 index 0000000..0083e0b --- /dev/null +++ b/src/core/org.freedesktop.systemd1.policy.in @@ -0,0 +1,83 @@ + + + + + + + + The systemd Project + https://systemd.io + + + Send passphrase back to system + Authentication is required to send the entered passphrase back to the system. + + no + no + auth_admin_keep + + {{LIBEXECDIR}}/systemd-reply-password + + + + Manage system services or other units + Authentication is required to manage system services or other units. + + auth_admin + auth_admin + auth_admin_keep + + + + + Manage system service or unit files + Authentication is required to manage system service or unit files. + + auth_admin + auth_admin + auth_admin_keep + + org.freedesktop.systemd1.reload-daemon org.freedesktop.systemd1.manage-units + + + + Set or unset system and service manager environment variables + Authentication is required to set or unset system and service manager environment variables. + + auth_admin + auth_admin + auth_admin_keep + + + + + Reload the systemd state + Authentication is required to reload the systemd state. + + auth_admin + auth_admin + auth_admin_keep + + + + + Dump the systemd state without rate limits + Authentication is required to dump the systemd state without rate limits. + + auth_admin + auth_admin + auth_admin_keep + + + + diff --git a/src/core/org.freedesktop.systemd1.service b/src/core/org.freedesktop.systemd1.service new file mode 100644 index 0000000..082125f --- /dev/null +++ b/src/core/org.freedesktop.systemd1.service @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[D-BUS Service] +Name=org.freedesktop.systemd1 +Exec=/bin/false +User=root diff --git a/src/core/path.c b/src/core/path.c new file mode 100644 index 0000000..ef00c20 --- /dev/null +++ b/src/core/path.c @@ -0,0 +1,1075 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "bus-error.h" +#include "bus-util.h" +#include "dbus-path.h" +#include "dbus-unit.h" +#include "escape.h" +#include "event-util.h" +#include "fd-util.h" +#include "glob-util.h" +#include "inotify-util.h" +#include "macro.h" +#include "mkdir-label.h" +#include "path.h" +#include "path-util.h" +#include "serialize.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_PATH_STATE_MAX] = { + [PATH_DEAD] = UNIT_INACTIVE, + [PATH_WAITING] = UNIT_ACTIVE, + [PATH_RUNNING] = UNIT_ACTIVE, + [PATH_FAILED] = UNIT_FAILED, +}; + +static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); + +int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler) { + static const int flags_table[_PATH_TYPE_MAX] = { + [PATH_EXISTS] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB, + [PATH_EXISTS_GLOB] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB, + [PATH_CHANGED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO, + [PATH_MODIFIED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO|IN_MODIFY, + [PATH_DIRECTORY_NOT_EMPTY] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CREATE|IN_MOVED_TO, + }; + + bool exists = false; + char *slash, *oldslash = NULL; + int r; + + assert(s); + assert(s->unit); + assert(handler); + + path_spec_unwatch(s); + + s->inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (s->inotify_fd < 0) { + r = log_error_errno(errno, "Failed to allocate inotify fd: %m"); + goto fail; + } + + r = sd_event_add_io(s->unit->manager->event, &s->event_source, s->inotify_fd, EPOLLIN, handler, s); + if (r < 0) { + log_error_errno(r, "Failed to add inotify fd to event loop: %m"); + goto fail; + } + + (void) sd_event_source_set_description(s->event_source, "path"); + + /* This function assumes the path was passed through path_simplify()! */ + assert(!strstr(s->path, "//")); + + for (slash = strchr(s->path, '/'); ; slash = strchr(slash+1, '/')) { + bool incomplete = false; + int flags, wd = -1; + char tmp, *cut; + + if (slash) { + cut = slash + (slash == s->path); + tmp = *cut; + *cut = '\0'; + + flags = IN_MOVE_SELF | IN_DELETE_SELF | IN_ATTRIB | IN_CREATE | IN_MOVED_TO; + } else { + cut = NULL; + flags = flags_table[s->type]; + } + + /* If this is a symlink watch both the symlink inode and where it points to. If the inode is + * not a symlink both calls will install the same watch, which is redundant and doesn't + * hurt. */ + for (int follow_symlink = 0; follow_symlink < 2; follow_symlink ++) { + uint32_t f = flags; + + SET_FLAG(f, IN_DONT_FOLLOW, !follow_symlink); + + wd = inotify_add_watch(s->inotify_fd, s->path, f); + if (wd < 0) { + if (IN_SET(errno, EACCES, ENOENT)) { + incomplete = true; /* This is an expected error, let's accept this + * quietly: we have an incomplete watch for + * now. */ + break; + } + + /* This second call to inotify_add_watch() should fail like the previous one + * and is done for logging the error in a comprehensive way. */ + wd = inotify_add_watch_and_warn(s->inotify_fd, s->path, f); + if (wd < 0) { + if (cut) + *cut = tmp; + + r = wd; + goto fail; + } + + /* Hmm, we succeeded in adding the watch this time... let's continue. */ + } + } + + if (incomplete) { + if (cut) + *cut = tmp; + + break; + } + + exists = true; + + /* Path exists, we don't need to watch parent too closely. */ + if (oldslash) { + char *cut2 = oldslash + (oldslash == s->path); + char tmp2 = *cut2; + *cut2 = '\0'; + + (void) inotify_add_watch(s->inotify_fd, s->path, IN_MOVE_SELF); + /* Error is ignored, the worst can happen is we get spurious events. */ + + *cut2 = tmp2; + } + + if (cut) + *cut = tmp; + + if (slash) + oldslash = slash; + else { + /* whole path has been iterated over */ + s->primary_wd = wd; + break; + } + } + + if (!exists) { + r = log_error_errno(errno, "Failed to add watch on any of the components of %s: %m", s->path); + /* either EACCESS or ENOENT */ + goto fail; + } + + return 0; + +fail: + path_spec_unwatch(s); + return r; +} + +void path_spec_unwatch(PathSpec *s) { + assert(s); + + s->event_source = sd_event_source_disable_unref(s->event_source); + s->inotify_fd = safe_close(s->inotify_fd); +} + +int path_spec_fd_event(PathSpec *s, uint32_t revents) { + union inotify_event_buffer buffer; + ssize_t l; + + assert(s); + + if (revents != EPOLLIN) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Got invalid poll event on inotify."); + + l = read(s->inotify_fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return log_error_errno(errno, "Failed to read inotify event: %m"); + } + + if (IN_SET(s->type, PATH_CHANGED, PATH_MODIFIED)) + FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) + if (s->primary_wd == e->wd) + return 1; + + return 0; +} + +static bool path_spec_check_good(PathSpec *s, bool initial, bool from_trigger_notify, char **ret_trigger_path) { + _cleanup_free_ char *trigger = NULL; + bool b, good = false; + + assert(s); + assert(ret_trigger_path); + + switch (s->type) { + + case PATH_EXISTS: + good = access(s->path, F_OK) >= 0; + break; + + case PATH_EXISTS_GLOB: + good = glob_first(s->path, &trigger) > 0; + break; + + case PATH_DIRECTORY_NOT_EMPTY: { + int k; + + k = dir_is_empty(s->path, /* ignore_hidden_or_backup= */ true); + good = !(IN_SET(k, -ENOENT, -ENOTDIR) || k > 0); + break; + } + + case PATH_CHANGED: + case PATH_MODIFIED: + b = access(s->path, F_OK) >= 0; + good = !initial && !from_trigger_notify && b != s->previous_exists; + s->previous_exists = b; + break; + + default: + ; + } + + if (good) { + if (!trigger) { + trigger = strdup(s->path); + if (!trigger) + (void) log_oom_debug(); + } + *ret_trigger_path = TAKE_PTR(trigger); + } + + return good; +} + +static void path_spec_mkdir(PathSpec *s, mode_t mode) { + int r; + + if (IN_SET(s->type, PATH_EXISTS, PATH_EXISTS_GLOB)) + return; + + r = mkdir_p_label(s->path, mode); + if (r < 0) + log_warning_errno(r, "mkdir(%s) failed: %m", s->path); +} + +static void path_spec_dump(PathSpec *s, FILE *f, const char *prefix) { + const char *type; + + assert_se(type = path_type_to_string(s->type)); + fprintf(f, "%s%s: %s\n", prefix, type, s->path); +} + +void path_spec_done(PathSpec *s) { + assert(s); + assert(s->inotify_fd == -EBADF); + + free(s->path); +} + +static void path_init(Unit *u) { + Path *p = PATH(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + p->directory_mode = 0755; + + p->trigger_limit = RATELIMIT_OFF; +} + +void path_free_specs(Path *p) { + PathSpec *s; + + assert(p); + + while ((s = LIST_POP(spec, p->specs))) { + path_spec_unwatch(s); + path_spec_done(s); + free(s); + } +} + +static void path_done(Unit *u) { + Path *p = PATH(u); + + assert(p); + + p->trigger_notify_event_source = sd_event_source_disable_unref(p->trigger_notify_event_source); + path_free_specs(p); +} + +static int path_add_mount_dependencies(Path *p) { + int r; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) { + r = unit_require_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + return 0; +} + +static int path_verify(Path *p) { + assert(p); + assert(UNIT(p)->load_state == UNIT_LOADED); + + if (!p->specs) + return log_unit_error_errno(UNIT(p), SYNTHETIC_ERRNO(ENOEXEC), "Path unit lacks path setting. Refusing."); + + return 0; +} + +static int path_add_default_dependencies(Path *p) { + int r; + + assert(p); + + if (!UNIT(p)->default_dependencies) + return 0; + + r = unit_add_dependency_by_name(UNIT(p), UNIT_BEFORE, SPECIAL_PATHS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(UNIT(p)->manager)) { + r = unit_add_two_dependencies_by_name(UNIT(p), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } + + return unit_add_two_dependencies_by_name(UNIT(p), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int path_add_trigger_dependencies(Path *p) { + Unit *x; + int r; + + assert(p); + + if (UNIT_TRIGGER(UNIT(p))) + return 0; + + r = unit_load_related_unit(UNIT(p), ".service", &x); + if (r < 0) + return r; + + return unit_add_two_dependencies(UNIT(p), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int path_add_extras(Path *p) { + int r; + + assert(p); + + /* To avoid getting pid1 in a busy-loop state (eg: unmet condition on associated service), + * set a default trigger limit if the user didn't specify any. */ + if (p->trigger_limit.interval == USEC_INFINITY) + p->trigger_limit.interval = 2 * USEC_PER_SEC; + + if (p->trigger_limit.burst == UINT_MAX) + p->trigger_limit.burst = 200; + + r = path_add_trigger_dependencies(p); + if (r < 0) + return r; + + r = path_add_mount_dependencies(p); + if (r < 0) + return r; + + return path_add_default_dependencies(p); +} + +static int path_load(Unit *u) { + Path *p = PATH(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + r = path_add_extras(p); + if (r < 0) + return r; + + return path_verify(p); +} + +static void path_dump(Unit *u, FILE *f, const char *prefix) { + Path *p = PATH(u); + Unit *trigger; + + assert(p); + assert(f); + + trigger = UNIT_TRIGGER(u); + + fprintf(f, + "%sPath State: %s\n" + "%sResult: %s\n" + "%sUnit: %s\n" + "%sMakeDirectory: %s\n" + "%sDirectoryMode: %04o\n" + "%sTriggerLimitIntervalSec: %s\n" + "%sTriggerLimitBurst: %u\n", + prefix, path_state_to_string(p->state), + prefix, path_result_to_string(p->result), + prefix, trigger ? trigger->id : "n/a", + prefix, yes_no(p->make_directory), + prefix, p->directory_mode, + prefix, FORMAT_TIMESPAN(p->trigger_limit.interval, USEC_PER_SEC), + prefix, p->trigger_limit.burst); + + LIST_FOREACH(spec, s, p->specs) + path_spec_dump(s, f, prefix); +} + +static void path_unwatch(Path *p) { + assert(p); + + LIST_FOREACH(spec, s, p->specs) + path_spec_unwatch(s); +} + +static int path_watch(Path *p) { + int r; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) { + r = path_spec_watch(s, path_dispatch_io); + if (r < 0) + return r; + } + + return 0; +} + +static void path_set_state(Path *p, PathState state) { + PathState old_state; + assert(p); + + if (p->state != state) + bus_unit_send_pending_change_signal(UNIT(p), false); + + old_state = p->state; + p->state = state; + + if (!IN_SET(state, PATH_WAITING, PATH_RUNNING)) + path_unwatch(p); + + if (state != old_state) + log_unit_debug(UNIT(p), "Changed %s -> %s", path_state_to_string(old_state), path_state_to_string(state)); + + unit_notify(UNIT(p), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); +} + +static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify); + +static int path_coldplug(Unit *u) { + Path *p = PATH(u); + + assert(p); + assert(p->state == PATH_DEAD); + + if (p->deserialized_state != p->state) { + + if (IN_SET(p->deserialized_state, PATH_WAITING, PATH_RUNNING)) + path_enter_waiting(p, true, false); + else + path_set_state(p, p->deserialized_state); + } + + return 0; +} + +static void path_enter_dead(Path *p, PathResult f) { + assert(p); + + if (p->result == PATH_SUCCESS) + p->result = f; + + unit_log_result(UNIT(p), p->result == PATH_SUCCESS, path_result_to_string(p->result)); + path_set_state(p, p->result != PATH_SUCCESS ? PATH_FAILED : PATH_DEAD); +} + +static void path_enter_running(Path *p, char *trigger_path) { + _cleanup_(activation_details_unrefp) ActivationDetails *details = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *trigger; + Job *job; + int r; + + assert(p); + + /* Don't start job if we are supposed to go down */ + if (unit_stop_pending(UNIT(p))) + return; + + if (!ratelimit_below(&p->trigger_limit)) { + log_unit_warning(UNIT(p), "Trigger limit hit, refusing further activation."); + path_enter_dead(p, PATH_FAILURE_TRIGGER_LIMIT_HIT); + return; + } + + trigger = UNIT_TRIGGER(UNIT(p)); + if (!trigger) { + log_unit_error(UNIT(p), "Unit to trigger vanished."); + goto fail; + } + + details = activation_details_new(UNIT(p)); + if (!details) { + log_oom(); + goto fail; + } + + r = free_and_strdup(&(ACTIVATION_DETAILS_PATH(details))->trigger_path_filename, trigger_path); + if (r < 0) { + log_oom(); + goto fail; + } + + r = manager_add_job(UNIT(p)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, &job); + if (r < 0) { + log_unit_warning(UNIT(p), "Failed to queue unit startup job: %s", bus_error_message(&error, r)); + goto fail; + } + + job_set_activation_details(job, details); + + path_set_state(p, PATH_RUNNING); + path_unwatch(p); + + return; + +fail: + path_enter_dead(p, PATH_FAILURE_RESOURCES); +} + +static bool path_check_good(Path *p, bool initial, bool from_trigger_notify, char **ret_trigger_path) { + assert(p); + assert(ret_trigger_path); + + LIST_FOREACH(spec, s, p->specs) + if (path_spec_check_good(s, initial, from_trigger_notify, ret_trigger_path)) + return true; + + return false; +} + +static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify) { + _cleanup_free_ char *trigger_path = NULL; + Unit *trigger; + int r; + + if (p->trigger_notify_event_source) + (void) event_source_disable(p->trigger_notify_event_source); + + /* If the triggered unit is already running, so are we */ + trigger = UNIT_TRIGGER(UNIT(p)); + if (trigger && !UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(trigger))) { + path_set_state(p, PATH_RUNNING); + path_unwatch(p); + return; + } + + if (path_check_good(p, initial, from_trigger_notify, &trigger_path)) { + log_unit_debug(UNIT(p), "Got triggered."); + path_enter_running(p, trigger_path); + return; + } + + r = path_watch(p); + if (r < 0) { + log_unit_warning_errno(UNIT(p), r, "Failed to enter waiting state: %m"); + path_enter_dead(p, PATH_FAILURE_RESOURCES); + return; + } + + /* Hmm, so now we have created inotify watches, but the file + * might have appeared/been removed by now, so we must + * recheck */ + + if (path_check_good(p, false, from_trigger_notify, &trigger_path)) { + log_unit_debug(UNIT(p), "Got triggered."); + path_enter_running(p, trigger_path); + return; + } + + path_set_state(p, PATH_WAITING); +} + +static void path_mkdir(Path *p) { + assert(p); + + if (!p->make_directory) + return; + + LIST_FOREACH(spec, s, p->specs) + path_spec_mkdir(s, p->directory_mode); +} + +static int path_start(Unit *u) { + Path *p = PATH(u); + int r; + + assert(p); + assert(IN_SET(p->state, PATH_DEAD, PATH_FAILED)); + + r = unit_test_trigger_loaded(u); + if (r < 0) + return r; + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + path_mkdir(p); + + p->result = PATH_SUCCESS; + path_enter_waiting(p, true, false); + + return 1; +} + +static int path_stop(Unit *u) { + Path *p = PATH(u); + + assert(p); + assert(IN_SET(p->state, PATH_WAITING, PATH_RUNNING)); + + path_enter_dead(p, PATH_SUCCESS); + return 1; +} + +static int path_serialize(Unit *u, FILE *f, FDSet *fds) { + Path *p = PATH(u); + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", path_state_to_string(p->state)); + (void) serialize_item(f, "result", path_result_to_string(p->result)); + + LIST_FOREACH(spec, s, p->specs) { + const char *type; + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(s->path); + if (!escaped) + return log_oom(); + + assert_se(type = path_type_to_string(s->type)); + (void) serialize_item_format(f, "path-spec", "%s %i %s", + type, + s->previous_exists, + escaped); + } + + (void) serialize_ratelimit(f, "trigger-ratelimit", &p->trigger_limit); + + return 0; +} + +static int path_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Path *p = PATH(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + PathState state; + + state = path_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + p->deserialized_state = state; + + } else if (streq(key, "result")) { + PathResult f; + + f = path_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != PATH_SUCCESS) + p->result = f; + + } else if (streq(key, "path-spec")) { + int previous_exists, skip = 0; + _cleanup_free_ char *type_str = NULL; + + if (sscanf(value, "%ms %i %n", &type_str, &previous_exists, &skip) < 2) + log_unit_debug(u, "Failed to parse path-spec value: %s", value); + else { + _cleanup_free_ char *unescaped = NULL; + ssize_t l; + PathType type; + + type = path_type_from_string(type_str); + if (type < 0) { + log_unit_warning(u, "Unknown path type \"%s\", ignoring.", type_str); + return 0; + } + + l = cunescape(value+skip, 0, &unescaped); + if (l < 0) { + log_unit_warning_errno(u, l, "Failed to unescape serialize path: %m"); + return 0; + } + + LIST_FOREACH(spec, s, p->specs) + if (s->type == type && + path_equal(s->path, unescaped)) { + + s->previous_exists = previous_exists; + break; + } + } + + } else if (streq(key, "trigger-ratelimit")) + deserialize_ratelimit(&p->trigger_limit, key, value); + + else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static UnitActiveState path_active_state(Unit *u) { + assert(u); + + return state_translation_table[PATH(u)->state]; +} + +static const char *path_sub_state_to_string(Unit *u) { + assert(u); + + return path_state_to_string(PATH(u)->state); +} + +static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + PathSpec *s = userdata, *found = NULL; + Path *p; + int changed; + + assert(s); + assert(s->unit); + assert(fd >= 0); + + p = PATH(s->unit); + + if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING)) + return 0; + + LIST_FOREACH(spec, i, p->specs) + if (path_spec_owns_inotify_fd(i, fd)) { + found = i; + break; + } + + if (!found) { + log_error("Got event on unknown fd."); + goto fail; + } + + changed = path_spec_fd_event(found, revents); + if (changed < 0) + goto fail; + + if (changed) + path_enter_running(p, found->path); + else + path_enter_waiting(p, false, false); + + return 0; + +fail: + path_enter_dead(p, PATH_FAILURE_RESOURCES); + return 0; +} + +static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer); + +static int path_trigger_notify_on_defer(sd_event_source *s, void *userdata) { + Path *p = ASSERT_PTR(userdata); + Unit *trigger; + + assert(s); + + trigger = UNIT_TRIGGER(UNIT(p)); + if (!trigger) { + log_unit_error(UNIT(p), "Unit to trigger vanished."); + path_enter_dead(p, PATH_FAILURE_RESOURCES); + return 0; + } + + path_trigger_notify_impl(UNIT(p), trigger, /* on_defer = */ true); + return 0; +} + +static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer) { + Path *p = PATH(u); + int r; + + assert(u); + assert(other); + + /* Invoked whenever the unit we trigger changes state or gains or loses a job */ + + /* Filter out invocations with bogus state */ + assert(UNIT_IS_LOAD_COMPLETE(other->load_state)); + + /* Don't propagate state changes from the triggered unit if we are already down */ + if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING)) + return; + + /* Propagate start limit hit state */ + if (other->start_limit_hit) { + path_enter_dead(p, PATH_FAILURE_UNIT_START_LIMIT_HIT); + return; + } + + /* Don't propagate anything if there's still a job queued */ + if (other->job) + return; + + if (p->state == PATH_RUNNING && + UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) { + if (!on_defer) + log_unit_debug(u, "Got notified about unit deactivation."); + } else if (p->state == PATH_WAITING && + !UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) { + if (!on_defer) + log_unit_debug(u, "Got notified about unit activation."); + } else + return; + + if (on_defer) { + path_enter_waiting(p, /* initial = */ false, /* from_trigger_notify = */ true); + return; + } + + /* Do not call path_enter_waiting() directly from path_trigger_notify(), as this may be called by + * job_install() -> job_finish_and_invalidate() -> unit_trigger_notify(), and path_enter_waiting() + * may install another job and will trigger assertion in job_install(). + * https://github.com/systemd/systemd/issues/24577#issuecomment-1522628906 + * Hence, first setup defer event source here, and call path_enter_waiting() slightly later. */ + if (p->trigger_notify_event_source) { + r = sd_event_source_set_enabled(p->trigger_notify_event_source, SD_EVENT_ONESHOT); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to enable event source for triggering notify: %m"); + path_enter_dead(p, PATH_FAILURE_RESOURCES); + return; + } + } else { + r = sd_event_add_defer(u->manager->event, &p->trigger_notify_event_source, path_trigger_notify_on_defer, p); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to allocate event source for triggering notify: %m"); + path_enter_dead(p, PATH_FAILURE_RESOURCES); + return; + } + + (void) sd_event_source_set_description(p->trigger_notify_event_source, "path-trigger-notify"); + } +} + +static void path_trigger_notify(Unit *u, Unit *other) { + path_trigger_notify_impl(u, other, /* on_defer = */ false); +} + +static void path_reset_failed(Unit *u) { + Path *p = PATH(u); + + assert(p); + + if (p->state == PATH_FAILED) + path_set_state(p, PATH_DEAD); + + p->result = PATH_SUCCESS; +} + +static int path_can_start(Unit *u) { + Path *p = PATH(u); + int r; + + assert(p); + + r = unit_test_start_limit(u); + if (r < 0) { + path_enter_dead(p, PATH_FAILURE_START_LIMIT_HIT); + return r; + } + + return 1; +} + +static void activation_details_path_done(ActivationDetails *details) { + ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details)); + + p->trigger_path_filename = mfree(p->trigger_path_filename); +} + +static void activation_details_path_serialize(ActivationDetails *details, FILE *f) { + ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details)); + + assert(f); + + if (p->trigger_path_filename) + (void) serialize_item(f, "activation-details-path-filename", p->trigger_path_filename); +} + +static int activation_details_path_deserialize(const char *key, const char *value, ActivationDetails **details) { + int r; + + assert(key); + assert(value); + + if (!details || !*details) + return -EINVAL; + + ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(*details); + if (!p) + return -EINVAL; + + if (!streq(key, "activation-details-path-filename")) + return -EINVAL; + + r = free_and_strdup(&p->trigger_path_filename, value); + if (r < 0) + return r; + + return 0; +} + +static int activation_details_path_append_env(ActivationDetails *details, char ***strv) { + ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details); + char *s; + int r; + + assert(details); + assert(strv); + assert(p); + + if (isempty(p->trigger_path_filename)) + return 0; + + s = strjoin("TRIGGER_PATH=", p->trigger_path_filename); + if (!s) + return -ENOMEM; + + r = strv_consume(strv, TAKE_PTR(s)); + if (r < 0) + return r; + + return 1; /* Return the number of variables added to the env block */ +} + +static int activation_details_path_append_pair(ActivationDetails *details, char ***strv) { + ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details); + int r; + + assert(details); + assert(strv); + assert(p); + + if (isempty(p->trigger_path_filename)) + return 0; + + r = strv_extend(strv, "trigger_path"); + if (r < 0) + return r; + + r = strv_extend(strv, p->trigger_path_filename); + if (r < 0) + return r; + + return 1; /* Return the number of pairs added to the env block */ +} + +static const char* const path_type_table[_PATH_TYPE_MAX] = { + [PATH_EXISTS] = "PathExists", + [PATH_EXISTS_GLOB] = "PathExistsGlob", + [PATH_DIRECTORY_NOT_EMPTY] = "DirectoryNotEmpty", + [PATH_CHANGED] = "PathChanged", + [PATH_MODIFIED] = "PathModified", +}; + +DEFINE_STRING_TABLE_LOOKUP(path_type, PathType); + +static const char* const path_result_table[_PATH_RESULT_MAX] = { + [PATH_SUCCESS] = "success", + [PATH_FAILURE_RESOURCES] = "resources", + [PATH_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [PATH_FAILURE_UNIT_START_LIMIT_HIT] = "unit-start-limit-hit", + [PATH_FAILURE_TRIGGER_LIMIT_HIT] = "trigger-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(path_result, PathResult); + +const UnitVTable path_vtable = { + .object_size = sizeof(Path), + + .sections = + "Unit\0" + "Path\0" + "Install\0", + .private_section = "Path", + + .can_transient = true, + .can_fail = true, + .can_trigger = true, + + .init = path_init, + .done = path_done, + .load = path_load, + + .coldplug = path_coldplug, + + .dump = path_dump, + + .start = path_start, + .stop = path_stop, + + .serialize = path_serialize, + .deserialize_item = path_deserialize_item, + + .active_state = path_active_state, + .sub_state_to_string = path_sub_state_to_string, + + .trigger_notify = path_trigger_notify, + + .reset_failed = path_reset_failed, + + .bus_set_property = bus_path_set_property, + + .can_start = path_can_start, +}; + +const ActivationDetailsVTable activation_details_path_vtable = { + .object_size = sizeof(ActivationDetailsPath), + + .done = activation_details_path_done, + .serialize = activation_details_path_serialize, + .deserialize = activation_details_path_deserialize, + .append_env = activation_details_path_append_env, + .append_pair = activation_details_path_append_pair, +}; diff --git a/src/core/path.h b/src/core/path.h new file mode 100644 index 0000000..cb5b662 --- /dev/null +++ b/src/core/path.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Path Path; +typedef struct PathSpec PathSpec; +typedef struct ActivationDetailsPath ActivationDetailsPath; + +#include "unit.h" + +typedef enum PathType { + PATH_EXISTS, + PATH_EXISTS_GLOB, + PATH_DIRECTORY_NOT_EMPTY, + PATH_CHANGED, + PATH_MODIFIED, + _PATH_TYPE_MAX, + _PATH_TYPE_INVALID = -EINVAL, +} PathType; + +typedef struct PathSpec { + Unit *unit; + + char *path; + + sd_event_source *event_source; + + LIST_FIELDS(struct PathSpec, spec); + + PathType type; + int inotify_fd; + int primary_wd; + + bool previous_exists; +} PathSpec; + +int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler); +void path_spec_unwatch(PathSpec *s); +int path_spec_fd_event(PathSpec *s, uint32_t events); +void path_spec_done(PathSpec *s); + +static inline bool path_spec_owns_inotify_fd(PathSpec *s, int fd) { + return s->inotify_fd == fd; +} + +typedef enum PathResult { + PATH_SUCCESS, + PATH_FAILURE_RESOURCES, + PATH_FAILURE_START_LIMIT_HIT, + PATH_FAILURE_UNIT_START_LIMIT_HIT, + PATH_FAILURE_TRIGGER_LIMIT_HIT, + _PATH_RESULT_MAX, + _PATH_RESULT_INVALID = -EINVAL, +} PathResult; + +struct Path { + Unit meta; + + LIST_HEAD(PathSpec, specs); + + PathState state, deserialized_state; + + bool make_directory; + mode_t directory_mode; + + PathResult result; + + RateLimit trigger_limit; + + sd_event_source *trigger_notify_event_source; +}; + +struct ActivationDetailsPath { + ActivationDetails meta; + char *trigger_path_filename; +}; + +void path_free_specs(Path *p); + +extern const UnitVTable path_vtable; +extern const ActivationDetailsVTable activation_details_path_vtable; + +const char* path_type_to_string(PathType i) _const_; +PathType path_type_from_string(const char *s) _pure_; + +const char* path_result_to_string(PathResult i) _const_; +PathResult path_result_from_string(const char *s) _pure_; + +DEFINE_CAST(PATH, Path); +DEFINE_ACTIVATION_DETAILS_CAST(ACTIVATION_DETAILS_PATH, ActivationDetailsPath, PATH); diff --git a/src/core/restrict-ifaces.c b/src/core/restrict-ifaces.c new file mode 100644 index 0000000..4dd8656 --- /dev/null +++ b/src/core/restrict-ifaces.c @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "restrict-ifaces.h" +#include "netlink-util.h" + +#if BPF_FRAMEWORK +/* libbpf, clang and llc compile time dependencies are satisfied */ + +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "bpf-util.h" +#include "bpf/restrict_ifaces/restrict-ifaces-skel.h" + +static struct restrict_ifaces_bpf *restrict_ifaces_bpf_free(struct restrict_ifaces_bpf *obj) { + restrict_ifaces_bpf__destroy(obj); + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_ifaces_bpf *, restrict_ifaces_bpf_free); + +static int prepare_restrict_ifaces_bpf( + Unit* u, + bool is_allow_list, + const Set *restrict_network_interfaces, + struct restrict_ifaces_bpf **ret_object) { + + _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + char *iface; + int r, map_fd; + + assert(ret_object); + + obj = restrict_ifaces_bpf__open(); + if (!obj) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "restrict-interfaces: Failed to open BPF object: %m"); + + r = sym_bpf_map__set_max_entries(obj->maps.sd_restrictif, MAX(set_size(restrict_network_interfaces), 1u)); + if (r != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r, + "restrict-interfaces: Failed to resize BPF map '%s': %m", + sym_bpf_map__name(obj->maps.sd_restrictif)); + + obj->rodata->is_allow_list = is_allow_list; + + r = restrict_ifaces_bpf__load(obj); + if (r != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, r, "restrict-interfaces: Failed to load BPF object: %m"); + + map_fd = sym_bpf_map__fd(obj->maps.sd_restrictif); + + SET_FOREACH(iface, restrict_network_interfaces) { + uint8_t dummy = 0; + int ifindex; + + ifindex = rtnl_resolve_interface(&rtnl, iface); + if (ifindex < 0) { + log_unit_warning_errno(u, ifindex, + "restrict-interfaces: Couldn't find index of network interface '%s', ignoring: %m", + iface); + continue; + } + + if (sym_bpf_map_update_elem(map_fd, &ifindex, &dummy, BPF_ANY)) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno, + "restrict-interfaces: Failed to update BPF map '%s' fd: %m", + sym_bpf_map__name(obj->maps.sd_restrictif)); + } + + *ret_object = TAKE_PTR(obj); + return 0; +} + +int restrict_network_interfaces_supported(void) { + _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; + static int supported = -1; + int r; + + if (supported >= 0) + return supported; + + if (!cgroup_bpf_supported()) + return (supported = false); + + if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SKB, /*opts=*/NULL)) { + log_debug("restrict-interfaces: BPF program type cgroup_skb is not supported"); + return (supported = false); + } + + r = prepare_restrict_ifaces_bpf(NULL, true, NULL, &obj); + if (r < 0) { + log_debug_errno(r, "restrict-interfaces: Failed to load BPF object: %m"); + return (supported = false); + } + + return (supported = bpf_can_link_program(obj->progs.sd_restrictif_i)); +} + +static int restrict_network_interfaces_install_impl(Unit *u) { + _cleanup_(bpf_link_freep) struct bpf_link *egress_link = NULL, *ingress_link = NULL; + _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; + _cleanup_free_ char *cgroup_path = NULL; + _cleanup_close_ int cgroup_fd = -EBADF; + CGroupContext *cc; + int r; + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "restrict-interfaces: Failed to get cgroup path: %m"); + + if (!cc->restrict_network_interfaces) + return 0; + + r = prepare_restrict_ifaces_bpf(u, + cc->restrict_network_interfaces_is_allow_list, + cc->restrict_network_interfaces, + &obj); + if (r < 0) + return r; + + cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC | O_DIRECTORY, 0); + if (cgroup_fd < 0) + return -errno; + + ingress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_i, cgroup_fd); + r = sym_libbpf_get_error(ingress_link); + if (r != 0) + return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create ingress cgroup link: %m"); + + egress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_e, cgroup_fd); + r = sym_libbpf_get_error(egress_link); + if (r != 0) + return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create egress cgroup link: %m"); + + u->restrict_ifaces_ingress_bpf_link = TAKE_PTR(ingress_link); + u->restrict_ifaces_egress_bpf_link = TAKE_PTR(egress_link); + + return 0; +} + +int restrict_network_interfaces_install(Unit *u) { + int r = restrict_network_interfaces_install_impl(u); + fdset_close(u->initial_restric_ifaces_link_fds); + return r; +} + +int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) { + int r; + + assert(u); + + r = bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_ingress_bpf_link); + if (r < 0) + return r; + + return bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_egress_bpf_link); +} + +int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) { + int r; + + assert(u); + + if (!u->initial_restric_ifaces_link_fds) { + u->initial_restric_ifaces_link_fds = fdset_new(); + if (!u->initial_restric_ifaces_link_fds) + return log_oom(); + } + + r = fdset_put(u->initial_restric_ifaces_link_fds, fd); + if (r < 0) + return log_unit_error_errno(u, r, + "restrict-interfaces: Failed to put restrict-ifaces-bpf-fd %d to restored fdset: %m", fd); + + return 0; +} + +#else /* ! BPF_FRAMEWORK */ +int restrict_network_interfaces_supported(void) { + return 0; +} + +int restrict_network_interfaces_install(Unit *u) { + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "restrict-interfaces: Failed to install; BPF programs built from source code are not supported: %m"); +} + +int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) { + return 0; +} + +int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) { + return 0; +} +#endif diff --git a/src/core/restrict-ifaces.h b/src/core/restrict-ifaces.h new file mode 100644 index 0000000..6e7a824 --- /dev/null +++ b/src/core/restrict-ifaces.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "fdset.h" +#include "unit.h" + +typedef struct Unit Unit; + +int restrict_network_interfaces_supported(void); +int restrict_network_interfaces_install(Unit *u); + +int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds); + +/* Add BPF link fd created before daemon-reload or daemon-reexec. + * FDs will be closed at the end of restrict_network_interfaces_install. */ +int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd); diff --git a/src/core/scope.c b/src/core/scope.c new file mode 100644 index 0000000..e4c27da --- /dev/null +++ b/src/core/scope.c @@ -0,0 +1,829 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "cgroup-setup.h" +#include "dbus-scope.h" +#include "dbus-unit.h" +#include "exit-status.h" +#include "load-dropin.h" +#include "log.h" +#include "process-util.h" +#include "random-util.h" +#include "scope.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" +#include "user-util.h" + +static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = { + [SCOPE_DEAD] = UNIT_INACTIVE, + [SCOPE_START_CHOWN] = UNIT_ACTIVATING, + [SCOPE_RUNNING] = UNIT_ACTIVE, + [SCOPE_ABANDONED] = UNIT_ACTIVE, + [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SCOPE_FAILED] = UNIT_FAILED, +}; + +static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); + +static void scope_init(Unit *u) { + Scope *s = SCOPE(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + s->runtime_max_usec = USEC_INFINITY; + s->timeout_stop_usec = u->manager->defaults.timeout_stop_usec; + u->ignore_on_isolate = true; + s->user = s->group = NULL; + s->oom_policy = _OOM_POLICY_INVALID; +} + +static void scope_done(Unit *u) { + Scope *s = SCOPE(u); + + assert(u); + + s->controller = mfree(s->controller); + s->controller_track = sd_bus_track_unref(s->controller_track); + + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + + s->user = mfree(s->user); + s->group = mfree(s->group); +} + +static usec_t scope_running_timeout(Scope *s) { + usec_t delta = 0; + + assert(s); + + if (s->runtime_rand_extra_usec != 0) { + delta = random_u64_range(s->runtime_rand_extra_usec); + log_unit_debug(UNIT(s), "Adding delta of %s sec to timeout", FORMAT_TIMESPAN(delta, USEC_PER_SEC)); + } + + return usec_add(usec_add(UNIT(s)->active_enter_timestamp.monotonic, + s->runtime_max_usec), + delta); +} + +static int scope_arm_timer(Scope *s, bool relative, usec_t usec) { + assert(s); + + return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, scope_dispatch_timer); +} + +static void scope_set_state(Scope *s, ScopeState state) { + ScopeState old_state; + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + old_state = s->state; + s->state = state; + + if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL, SCOPE_START_CHOWN, SCOPE_RUNNING)) + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + + if (!IN_SET(old_state, SCOPE_DEAD, SCOPE_FAILED) && IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) { + unit_unwatch_all_pids(UNIT(s)); + unit_dequeue_rewatch_pids(UNIT(s)); + } + + if (state != old_state) + log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state)); + + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); +} + +static int scope_add_default_dependencies(Scope *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + /* Make sure scopes are unloaded on shutdown */ + r = unit_add_two_dependencies_by_name( + UNIT(s), + UNIT_BEFORE, UNIT_CONFLICTS, + SPECIAL_SHUTDOWN_TARGET, true, + UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return 0; +} + +static int scope_verify(Scope *s) { + assert(s); + assert(UNIT(s)->load_state == UNIT_LOADED); + + if (set_isempty(UNIT(s)->pids) && + !MANAGER_IS_RELOADING(UNIT(s)->manager) && + !unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT), "Scope has no PIDs. Refusing."); + + return 0; +} + +static int scope_load_init_scope(Unit *u) { + assert(u); + + if (!unit_has_name(u, SPECIAL_INIT_SCOPE)) + return 0; + + u->transient = true; + u->perpetual = true; + + /* init.scope is a bit special, as it has to stick around forever. Because of its special semantics we + * synthesize it here, instead of relying on the unit file on disk. */ + + u->default_dependencies = false; + + /* Prettify things, if we can. */ + if (!u->description) + u->description = strdup("System and Service Manager"); + if (!u->documentation) + (void) strv_extend(&u->documentation, "man:systemd(1)"); + + return 1; +} + +static int scope_add_extras(Scope *s) { + int r; + + r = unit_patch_contexts(UNIT(s)); + if (r < 0) + return r; + + r = unit_set_default_slice(UNIT(s)); + if (r < 0) + return r; + + if (s->oom_policy < 0) + s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->defaults.oom_policy; + + s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL; + + return scope_add_default_dependencies(s); +} + +static int scope_load(Unit *u) { + Scope *s = SCOPE(u); + int r; + + assert(s); + assert(u->load_state == UNIT_STUB); + + if (!u->transient && !MANAGER_IS_RELOADING(u->manager)) + /* Refuse to load non-transient scope units, but allow them while reloading. */ + return -ENOENT; + + r = scope_load_init_scope(u); + if (r < 0) + return r; + + r = unit_load_fragment_and_dropin(u, false); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + r = scope_add_extras(s); + if (r < 0) + return r; + + return scope_verify(s); +} + +static usec_t scope_coldplug_timeout(Scope *s) { + assert(s); + + switch (s->deserialized_state) { + + case SCOPE_RUNNING: + return scope_running_timeout(s); + + case SCOPE_STOP_SIGKILL: + case SCOPE_STOP_SIGTERM: + return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec); + + default: + return USEC_INFINITY; + } +} + +static int scope_coldplug(Unit *u) { + Scope *s = SCOPE(u); + int r; + + assert(s); + assert(s->state == SCOPE_DEAD); + + if (s->deserialized_state == s->state) + return 0; + + r = scope_arm_timer(s, /* relative= */ false, scope_coldplug_timeout(s)); + if (r < 0) + return r; + + if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED)) { + if (u->pids) { + PidRef *pid; + + SET_FOREACH(pid, u->pids) { + r = unit_watch_pidref(u, pid, /* exclusive= */ false); + if (r < 0 && r != -EEXIST) + return r; + } + } else + (void) unit_enqueue_rewatch_pids(u); + } + + bus_scope_track_controller(s); + + scope_set_state(s, s->deserialized_state); + return 0; +} + +static void scope_dump(Unit *u, FILE *f, const char *prefix) { + Scope *s = SCOPE(u); + + assert(s); + assert(f); + + fprintf(f, + "%sScope State: %s\n" + "%sResult: %s\n" + "%sRuntimeMaxSec: %s\n" + "%sRuntimeRandomizedExtraSec: %s\n" + "%sOOMPolicy: %s\n", + prefix, scope_state_to_string(s->state), + prefix, scope_result_to_string(s->result), + prefix, FORMAT_TIMESPAN(s->runtime_max_usec, USEC_PER_SEC), + prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC), + prefix, oom_policy_to_string(s->oom_policy)); + + cgroup_context_dump(UNIT(s), f, prefix); + kill_context_dump(&s->kill_context, f, prefix); +} + +static void scope_enter_dead(Scope *s, ScopeResult f) { + assert(s); + + if (s->result == SCOPE_SUCCESS) + s->result = f; + + unit_log_result(UNIT(s), s->result == SCOPE_SUCCESS, scope_result_to_string(s->result)); + scope_set_state(s, s->result != SCOPE_SUCCESS ? SCOPE_FAILED : SCOPE_DEAD); +} + +static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) { + bool skip_signal = false; + int r; + + assert(s); + + if (s->result == SCOPE_SUCCESS) + s->result = f; + + /* Before sending any signal, make sure we track all members of this cgroup */ + (void) unit_watch_all_pids(UNIT(s)); + + /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have + * died now */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + /* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going + * directly into SIGTERM berserk mode */ + if (state == SCOPE_STOP_SIGTERM) + skip_signal = bus_scope_send_request_stop(s) > 0; + + if (skip_signal) + r = 1; /* wait */ + else { + r = unit_kill_context( + UNIT(s), + &s->kill_context, + state != SCOPE_STOP_SIGTERM ? KILL_KILL : + s->was_abandoned ? KILL_TERMINATE_AND_LOG : + KILL_TERMINATE, + /* main_pid= */ NULL, + /* control_pid= */ NULL, + /* main_pid_alien= */ false); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + goto fail; + } + } + + if (r > 0) { + r = scope_arm_timer(s, /* relative= */ true, s->timeout_stop_usec); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m"); + goto fail; + } + + scope_set_state(s, state); + } else if (state == SCOPE_STOP_SIGTERM) + scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_SUCCESS); + else + scope_enter_dead(s, SCOPE_SUCCESS); + + return; + +fail: + scope_enter_dead(s, SCOPE_FAILURE_RESOURCES); +} + +static int scope_enter_start_chown(Scope *s) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + Unit *u = UNIT(s); + int r; + + assert(s); + assert(s->user); + + r = scope_arm_timer(s, /* relative= */ true, u->manager->defaults.timeout_start_usec); + if (r < 0) + return r; + + r = unit_fork_helper_process(u, "(sd-chown-cgroup)", &pidref); + if (r < 0) + goto fail; + + if (r == 0) { + uid_t uid = UID_INVALID; + gid_t gid = GID_INVALID; + + if (!isempty(s->user)) { + const char *user = s->user; + + r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to resolve user \"%s\": %m", user); + _exit(EXIT_USER); + } + } + + if (!isempty(s->group)) { + const char *group = s->group; + + r = get_group_creds(&group, &gid, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to resolve group \"%s\": %m", group); + _exit(EXIT_GROUP); + } + } + + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, uid, gid); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to adjust control group access: %m"); + _exit(EXIT_CGROUP); + } + + _exit(EXIT_SUCCESS); + } + + r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true); + if (r < 0) + goto fail; + + scope_set_state(s, SCOPE_START_CHOWN); + + return 1; +fail: + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + return r; +} + +static int scope_enter_running(Scope *s) { + Unit *u = UNIT(s); + int r; + + assert(s); + + (void) bus_scope_track_controller(s); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + unit_export_state_files(u); + + r = unit_attach_pids_to_cgroup(u, u->pids, NULL); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to add PIDs to scope's control group: %m"); + goto fail; + } + if (r == 0) { + r = log_unit_warning_errno(u, SYNTHETIC_ERRNO(ECHILD), "No PIDs left to attach to the scope's control group, refusing."); + goto fail; + } + log_unit_debug(u, "%i %s added to scope's control group.", r, r == 1 ? "process" : "processes"); + + s->result = SCOPE_SUCCESS; + + scope_set_state(s, SCOPE_RUNNING); + + /* Set the maximum runtime timeout. */ + scope_arm_timer(s, /* relative= */ false, scope_running_timeout(s)); + + /* On unified we use proper notifications hence we can unwatch the PIDs + * we just attached to the scope. This can also be done on legacy as + * we're going to update the list of the processes we watch with the + * PIDs currently in the scope anyway. */ + unit_unwatch_all_pids(u); + + /* Start watching the PIDs currently in the scope (legacy hierarchy only) */ + (void) unit_enqueue_rewatch_pids(u); + return 1; + +fail: + scope_enter_dead(s, SCOPE_FAILURE_RESOURCES); + return r; +} + +static int scope_start(Unit *u) { + Scope *s = SCOPE(u); + + assert(s); + + if (unit_has_name(u, SPECIAL_INIT_SCOPE)) + return -EPERM; + + if (s->state == SCOPE_FAILED) + return -EPERM; + + /* We can't fulfill this right now, please try again later */ + if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + return -EAGAIN; + + assert(s->state == SCOPE_DEAD); + + if (!u->transient && !MANAGER_IS_RELOADING(u->manager)) + return -ENOENT; + + (void) unit_realize_cgroup(u); + (void) unit_reset_accounting(u); + + /* We check only for User= option to keep behavior consistent with logic for service units, + * i.e. having 'Delegate=true Group=foo' w/o specifying User= has no effect. */ + if (s->user && unit_cgroup_delegate(u)) + return scope_enter_start_chown(s); + + return scope_enter_running(s); +} + +static int scope_stop(Unit *u) { + Scope *s = SCOPE(u); + + assert(s); + + if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + return 0; + + assert(IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED)); + + scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS); + return 1; +} + +static void scope_reset_failed(Unit *u) { + Scope *s = SCOPE(u); + + assert(s); + + if (s->state == SCOPE_FAILED) + scope_set_state(s, SCOPE_DEAD); + + s->result = SCOPE_SUCCESS; +} + +static int scope_get_timeout(Unit *u, usec_t *timeout) { + Scope *s = SCOPE(u); + usec_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static int scope_serialize(Unit *u, FILE *f, FDSet *fds) { + Scope *s = SCOPE(u); + PidRef *pid; + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", scope_state_to_string(s->state)); + (void) serialize_bool(f, "was-abandoned", s->was_abandoned); + + if (s->controller) + (void) serialize_item(f, "controller", s->controller); + + SET_FOREACH(pid, u->pids) + serialize_pidref(f, fds, "pids", pid); + + return 0; +} + +static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Scope *s = SCOPE(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + ScopeState state; + + state = scope_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + + } else if (streq(key, "was-abandoned")) { + int k; + + k = parse_boolean(value); + if (k < 0) + log_unit_debug(u, "Failed to parse boolean value: %s", value); + else + s->was_abandoned = k; + } else if (streq(key, "controller")) { + + r = free_and_strdup(&s->controller, value); + if (r < 0) + return log_oom(); + + } else if (streq(key, "pids")) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + + if (deserialize_pidref(fds, value, &pidref) >= 0) { + r = unit_watch_pidref(u, &pidref, /* exclusive= */ false); + if (r < 0) + log_unit_debug(u, "Failed to watch PID, ignoring: %s", value); + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static void scope_notify_cgroup_empty_event(Unit *u) { + Scope *s = SCOPE(u); + assert(u); + + log_unit_debug(u, "cgroup is empty"); + + if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + scope_enter_dead(s, SCOPE_SUCCESS); +} + +static void scope_notify_cgroup_oom_event(Unit *u, bool managed_oom) { + Scope *s = SCOPE(u); + + if (managed_oom) + log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd."); + else + log_unit_debug(u, "Process of control group was killed by the OOM killer."); + + if (s->oom_policy == OOM_CONTINUE) + return; + + switch (s->state) { + + case SCOPE_START_CHOWN: + case SCOPE_RUNNING: + scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_FAILURE_OOM_KILL); + break; + + case SCOPE_STOP_SIGTERM: + scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_OOM_KILL); + break; + + case SCOPE_STOP_SIGKILL: + if (s->result == SCOPE_SUCCESS) + s->result = SCOPE_FAILURE_OOM_KILL; + break; + /* SCOPE_DEAD, SCOPE_ABANDONED, and SCOPE_FAILED end up in default */ + default: + ; + } +} + +static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Scope *s = SCOPE(u); + + assert(s); + + if (s->state == SCOPE_START_CHOWN) { + if (!is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + scope_enter_dead(s, SCOPE_FAILURE_RESOURCES); + else + scope_enter_running(s); + return; + } + + /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to + * watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original + * process we watched was probably the parent of them, and they are hence now our children. */ + + (void) unit_enqueue_rewatch_pids(u); +} + +static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Scope *s = SCOPE(userdata); + + assert(s); + assert(s->timer_event_source == source); + + switch (s->state) { + + case SCOPE_RUNNING: + log_unit_warning(UNIT(s), "Scope reached runtime time limit. Stopping."); + scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_FAILURE_TIMEOUT); + break; + + case SCOPE_STOP_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out. Killing."); + scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL."); + scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT); + } + + break; + + case SCOPE_STOP_SIGKILL: + log_unit_warning(UNIT(s), "Still around after SIGKILL. Ignoring."); + scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT); + break; + + case SCOPE_START_CHOWN: + log_unit_warning(UNIT(s), "User lookup timed out. Entering failed state."); + scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT); + break; + + default: + assert_not_reached(); + } + + return 0; +} + +int scope_abandon(Scope *s) { + assert(s); + + if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) + return -EPERM; + + if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED)) + return -ESTALE; + + s->was_abandoned = true; + + s->controller = mfree(s->controller); + s->controller_track = sd_bus_track_unref(s->controller_track); + + scope_set_state(s, SCOPE_ABANDONED); + + /* The client is no longer watching the remaining processes, so let's step in here, under the assumption that + * the remaining processes will be sooner or later reassigned to us as parent. */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + return 0; +} + +static UnitActiveState scope_active_state(Unit *u) { + assert(u); + + return state_translation_table[SCOPE(u)->state]; +} + +static const char *scope_sub_state_to_string(Unit *u) { + assert(u); + + return scope_state_to_string(SCOPE(u)->state); +} + +static void scope_enumerate_perpetual(Manager *m) { + Unit *u; + int r; + + assert(m); + + /* Let's unconditionally add the "init.scope" special unit + * that encapsulates PID 1. Note that PID 1 already is in the + * cgroup for this, we hence just need to allocate the object + * for it and that's it. */ + + u = manager_get_unit(m, SPECIAL_INIT_SCOPE); + if (!u) { + r = unit_new_for_name(m, sizeof(Scope), SPECIAL_INIT_SCOPE, &u); + if (r < 0) { + log_error_errno(r, "Failed to allocate the special " SPECIAL_INIT_SCOPE " unit: %m"); + return; + } + } + + u->transient = true; + u->perpetual = true; + SCOPE(u)->deserialized_state = SCOPE_RUNNING; + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); + /* Enqueue an explicit cgroup realization here. Unlike other cgroups this one already exists and is + * populated (by us, after all!) already, even when we are not in a reload cycle. Hence we cannot + * apply the settings at creation time anymore, but let's at least apply them asynchronously. */ + unit_add_to_cgroup_realize_queue(u); +} + +static const char* const scope_result_table[_SCOPE_RESULT_MAX] = { + [SCOPE_SUCCESS] = "success", + [SCOPE_FAILURE_RESOURCES] = "resources", + [SCOPE_FAILURE_TIMEOUT] = "timeout", + [SCOPE_FAILURE_OOM_KILL] = "oom-kill", +}; + +DEFINE_STRING_TABLE_LOOKUP(scope_result, ScopeResult); + +const UnitVTable scope_vtable = { + .object_size = sizeof(Scope), + .cgroup_context_offset = offsetof(Scope, cgroup_context), + .kill_context_offset = offsetof(Scope, kill_context), + + .sections = + "Unit\0" + "Scope\0" + "Install\0", + .private_section = "Scope", + + .can_transient = true, + .can_delegate = true, + .can_fail = true, + .once_only = true, + .can_set_managed_oom = true, + + .init = scope_init, + .load = scope_load, + .done = scope_done, + + .coldplug = scope_coldplug, + + .dump = scope_dump, + + .start = scope_start, + .stop = scope_stop, + + .freeze = unit_freeze_vtable_common, + .thaw = unit_thaw_vtable_common, + + .get_timeout = scope_get_timeout, + + .serialize = scope_serialize, + .deserialize_item = scope_deserialize_item, + + .active_state = scope_active_state, + .sub_state_to_string = scope_sub_state_to_string, + + .sigchld_event = scope_sigchld_event, + + .reset_failed = scope_reset_failed, + + .notify_cgroup_empty = scope_notify_cgroup_empty_event, + .notify_cgroup_oom = scope_notify_cgroup_oom_event, + + .bus_set_property = bus_scope_set_property, + .bus_commit_properties = bus_scope_commit_properties, + + .enumerate_perpetual = scope_enumerate_perpetual, +}; diff --git a/src/core/scope.h b/src/core/scope.h new file mode 100644 index 0000000..c9574a3 --- /dev/null +++ b/src/core/scope.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Scope Scope; + +#include "cgroup.h" +#include "kill.h" +#include "unit.h" + +typedef enum ScopeResult { + SCOPE_SUCCESS, + SCOPE_FAILURE_RESOURCES, + SCOPE_FAILURE_TIMEOUT, + SCOPE_FAILURE_OOM_KILL, + _SCOPE_RESULT_MAX, + _SCOPE_RESULT_INVALID = -EINVAL, +} ScopeResult; + +struct Scope { + Unit meta; + + CGroupContext cgroup_context; + KillContext kill_context; + + ScopeState state, deserialized_state; + ScopeResult result; + + usec_t runtime_max_usec; + usec_t runtime_rand_extra_usec; + usec_t timeout_stop_usec; + + char *controller; + sd_bus_track *controller_track; + + bool was_abandoned; + + sd_event_source *timer_event_source; + + char *user; + char *group; + + OOMPolicy oom_policy; +}; + +extern const UnitVTable scope_vtable; + +int scope_abandon(Scope *s); + +const char* scope_result_to_string(ScopeResult i) _const_; +ScopeResult scope_result_from_string(const char *s) _pure_; + +DEFINE_CAST(SCOPE, Scope); diff --git a/src/core/selinux-access.c b/src/core/selinux-access.c new file mode 100644 index 0000000..62181a6 --- /dev/null +++ b/src/core/selinux-access.c @@ -0,0 +1,288 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "selinux-access.h" + +#if HAVE_SELINUX + +#include +#include +#include +#if HAVE_AUDIT +#include +#endif + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "audit-fd.h" +#include "bus-util.h" +#include "errno-util.h" +#include "format-util.h" +#include "log.h" +#include "path-util.h" +#include "selinux-util.h" +#include "stdio-util.h" +#include "strv.h" + +static bool initialized = false; + +struct audit_info { + sd_bus_creds *creds; + const char *path; + const char *cmdline; + const char *function; +}; + +/* + Any time an access gets denied this callback will be called + with the audit data. We then need to just copy the audit data into the msgbuf. +*/ +static int audit_callback( + void *auditdata, + security_class_t cls, + char *msgbuf, + size_t msgbufsize) { + + const struct audit_info *audit = auditdata; + uid_t uid = 0, login_uid = 0; + gid_t gid = 0; + char login_uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a"; + char uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a"; + char gid_buf[DECIMAL_STR_MAX(gid_t) + 1] = "n/a"; + + if (sd_bus_creds_get_audit_login_uid(audit->creds, &login_uid) >= 0) + xsprintf(login_uid_buf, UID_FMT, login_uid); + if (sd_bus_creds_get_euid(audit->creds, &uid) >= 0) + xsprintf(uid_buf, UID_FMT, uid); + if (sd_bus_creds_get_egid(audit->creds, &gid) >= 0) + xsprintf(gid_buf, GID_FMT, gid); + + (void) snprintf(msgbuf, msgbufsize, + "auid=%s uid=%s gid=%s%s%s%s%s%s%s%s%s%s", + login_uid_buf, uid_buf, gid_buf, + audit->path ? " path=\"" : "", strempty(audit->path), audit->path ? "\"" : "", + audit->cmdline ? " cmdline=\"" : "", strempty(audit->cmdline), audit->cmdline ? "\"" : "", + audit->function ? " function=\"" : "", strempty(audit->function), audit->function ? "\"" : ""); + + return 0; +} + +static int callback_type_to_priority(int type) { + switch (type) { + + case SELINUX_ERROR: + return LOG_ERR; + + case SELINUX_WARNING: + return LOG_WARNING; + + case SELINUX_INFO: + return LOG_INFO; + + case SELINUX_AVC: + default: + return LOG_NOTICE; + } +} + +/* + libselinux uses this callback when access gets denied or other + events happen. If audit is turned on, messages will be reported + using audit netlink, otherwise they will be logged using the usual + channels. + + Code copied from dbus and modified. +*/ +_printf_(2, 3) static int log_callback(int type, const char *fmt, ...) { + va_list ap; + const char *fmt2; + +#if HAVE_AUDIT + int fd; + + fd = get_audit_fd(); + + if (fd >= 0) { + _cleanup_free_ char *buf = NULL; + int r; + + va_start(ap, fmt); + r = vasprintf(&buf, fmt, ap); + va_end(ap); + + if (r >= 0) { + if (type == SELINUX_AVC) + audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_AVC, buf, NULL, NULL, NULL, getuid()); + else if (type == SELINUX_ERROR) + audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_SELINUX_ERR, buf, NULL, NULL, NULL, getuid()); + + return 0; + } + } +#endif + + fmt2 = strjoina("selinux: ", fmt); + + va_start(ap, fmt); + + DISABLE_WARNING_FORMAT_NONLITERAL; + log_internalv(LOG_AUTH | callback_type_to_priority(type), + 0, PROJECT_FILE, __LINE__, __func__, + fmt2, ap); + REENABLE_WARNING; + va_end(ap); + + return 0; +} + +static int access_init(sd_bus_error *error) { + int r; + + if (!mac_selinux_use()) + return 0; + + if (initialized) + return 1; + + if (avc_open(NULL, 0) != 0) { + r = -errno; /* Save original errno for later */ + + bool enforce = security_getenforce() != 0; + log_full_errno(enforce ? LOG_ERR : LOG_WARNING, r, "Failed to open the SELinux AVC: %m"); + + /* If enforcement isn't on, then let's suppress this error, and just don't do any AVC checks. + * The warning we printed is hence all the admin will see. */ + if (!enforce) + return 0; + + /* Return an access denied error based on the original errno, if we couldn't load the AVC but + * enforcing mode was on, or we couldn't determine whether it is one. */ + errno = -r; + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to open the SELinux AVC: %m"); + } + + selinux_set_callback(SELINUX_CB_AUDIT, (union selinux_callback) { .func_audit = audit_callback }); + selinux_set_callback(SELINUX_CB_LOG, (union selinux_callback) { .func_log = log_callback }); + + initialized = true; + return 1; +} + +/* + This function communicates with the kernel to check whether or not it should + allow the access. + If the machine is in permissive mode it will return ok. Audit messages will + still be generated if the access would be denied in enforcing mode. +*/ +int mac_selinux_access_check_internal( + sd_bus_message *message, + const char *unit_path, + const char *unit_context, + const char *permission, + const char *function, + sd_bus_error *error) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + const char *tclass, *scon, *acon; + _cleanup_free_ char *cl = NULL; + _cleanup_freecon_ char *fcon = NULL; + char **cmdline = NULL; + bool enforce; + int r = 0; + + assert(message); + assert(permission); + assert(function); + assert(error); + + r = access_init(error); + if (r <= 0) + return r; + + /* delay call until we checked in `access_init()` if SELinux is actually enabled */ + enforce = mac_selinux_enforcing(); + + r = sd_bus_query_sender_creds( + message, + SD_BUS_CREDS_PID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_EGID| + SD_BUS_CREDS_CMDLINE|SD_BUS_CREDS_AUDIT_LOGIN_UID| + SD_BUS_CREDS_SELINUX_CONTEXT| + SD_BUS_CREDS_AUGMENT /* get more bits from /proc */, + &creds); + if (r < 0) + return r; + + /* The SELinux context is something we really should have gotten directly from the message or sender, + * and not be an augmented field. If it was augmented we cannot use it for authorization, since this + * is racy and vulnerable. Let's add an extra check, just in case, even though this really shouldn't + * be possible. */ + assert_return((sd_bus_creds_get_augmented_mask(creds) & SD_BUS_CREDS_SELINUX_CONTEXT) == 0, -EPERM); + + r = sd_bus_creds_get_selinux_context(creds, &scon); + if (r < 0) + return r; + + if (unit_context) { + /* Nice! The unit comes with a SELinux context read from the unit file */ + acon = unit_context; + tclass = "service"; + } else { + /* If no unit context is known, use our own */ + if (getcon_raw(&fcon) < 0) { + log_warning_errno(errno, "SELinux getcon_raw() failed%s (perm=%s): %m", + enforce ? "" : ", ignoring", + permission); + if (!enforce) + return 0; + + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to get current context: %m"); + } + if (!fcon) { + if (!enforce) + return 0; + + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "We appear not to have any SELinux context: %m"); + } + + acon = fcon; + tclass = "system"; + } + + sd_bus_creds_get_cmdline(creds, &cmdline); + cl = strv_join(cmdline, " "); + + struct audit_info audit_info = { + .creds = creds, + .path = unit_path, + .cmdline = cl, + .function = function, + }; + + r = selinux_check_access(scon, acon, tclass, permission, &audit_info); + if (r < 0) { + errno = -(r = errno_or_else(EPERM)); + + if (enforce) + sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "SELinux policy denies access: %m"); + } + + log_full_errno_zerook(LOG_DEBUG, r, + "SELinux access check scon=%s tcon=%s tclass=%s perm=%s state=%s function=%s path=%s cmdline=%s: %m", + scon, acon, tclass, permission, enforce ? "enforcing" : "permissive", function, strna(unit_path), strna(empty_to_null(cl))); + return enforce ? r : 0; +} + +#else /* HAVE_SELINUX */ + +int mac_selinux_access_check_internal( + sd_bus_message *message, + const char *unit_path, + const char *unit_label, + const char *permission, + const char *function, + sd_bus_error *error) { + + return 0; +} + +#endif /* HAVE_SELINUX */ diff --git a/src/core/selinux-access.h b/src/core/selinux-access.h new file mode 100644 index 0000000..dc8da9e --- /dev/null +++ b/src/core/selinux-access.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "manager.h" + +int mac_selinux_access_check_internal(sd_bus_message *message, const char *unit_path, const char *unit_label, const char *permission, const char *function, sd_bus_error *error); + +#define mac_selinux_access_check(message, permission, error) \ + mac_selinux_access_check_internal((message), NULL, NULL, (permission), __func__, (error)) + +#define mac_selinux_unit_access_check(unit, message, permission, error) \ + mac_selinux_access_check_internal((message), (unit)->fragment_path, (unit)->access_selinux_context, (permission), __func__, (error)) diff --git a/src/core/selinux-setup.c b/src/core/selinux-setup.c new file mode 100644 index 0000000..bc1a249 --- /dev/null +++ b/src/core/selinux-setup.c @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#if HAVE_SELINUX +#include +#endif + +#include "sd-messages.h" + +#include "initrd-util.h" +#include "log.h" +#include "macro.h" +#include "selinux-setup.h" +#include "selinux-util.h" +#include "string-util.h" +#include "time-util.h" + +#if HAVE_SELINUX +_printf_(2,3) +static int null_log(int type, const char *fmt, ...) { + return 0; +} +#endif + +int mac_selinux_setup(bool *loaded_policy) { + +#if HAVE_SELINUX + int enforce = 0; + usec_t before_load, after_load; + char *con; + int r; + bool initialized; + + assert(loaded_policy); + + /* Turn off all of SELinux' own logging, we want to do that */ + selinux_set_callback(SELINUX_CB_LOG, (const union selinux_callback) { .func_log = null_log }); + + /* Don't load policy in the initrd if we don't appear to have it. For the real root, we check below + * if we've already loaded policy, and return gracefully. */ + if (in_initrd() && access(selinux_path(), F_OK) < 0) + return 0; + + /* Already initialized by somebody else? */ + r = getcon_raw(&con); + /* getcon_raw can return 0, and still give us a NULL pointer if /proc/self/attr/current is + * empty. SELinux guarantees this won't happen, but that file isn't specific to SELinux, and may be + * provided by some other arbitrary LSM with different semantics. */ + if (r == 0 && con) { + initialized = !streq(con, "kernel"); + freecon(con); + } else + initialized = false; + + /* Make sure we have no fds open while loading the policy and + * transitioning */ + log_close(); + + /* Now load the policy */ + before_load = now(CLOCK_MONOTONIC); + r = selinux_init_load_policy(&enforce); + if (r == 0) { + _cleanup_(mac_selinux_freep) char *label = NULL; + + mac_selinux_retest(); + + /* Transition to the new context */ + r = mac_selinux_get_create_label_from_exe(SYSTEMD_BINARY_PATH, &label); + if (r < 0 || !label) { + log_open(); + log_error("Failed to compute init label, ignoring."); + } else { + r = setcon_raw(label); + + log_open(); + if (r < 0) + log_error("Failed to transition into init label '%s', ignoring.", label); + } + + after_load = now(CLOCK_MONOTONIC); + + log_info("Successfully loaded SELinux policy in %s.", + FORMAT_TIMESPAN(after_load - before_load, 0)); + + *loaded_policy = true; + + } else { + log_open(); + + if (enforce > 0) { + if (!initialized) + return log_struct_errno(LOG_EMERG, SYNTHETIC_ERRNO(EIO), + LOG_MESSAGE("Failed to load SELinux policy :%m"), + "MESSAGE_ID=" SD_MESSAGE_SELINUX_FAILED_STR); + + log_warning("Failed to load new SELinux policy. Continuing with old policy."); + } else + log_debug("Unable to load SELinux policy. Ignoring."); + } +#endif + + return 0; +} diff --git a/src/core/selinux-setup.h b/src/core/selinux-setup.h new file mode 100644 index 0000000..cdff51d --- /dev/null +++ b/src/core/selinux-setup.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int mac_selinux_setup(bool *loaded_policy); diff --git a/src/core/service.c b/src/core/service.c new file mode 100644 index 0000000..060ac08 --- /dev/null +++ b/src/core/service.c @@ -0,0 +1,5161 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "async.h" +#include "bus-error.h" +#include "bus-kernel.h" +#include "bus-util.h" +#include "chase.h" +#include "constants.h" +#include "dbus-service.h" +#include "dbus-unit.h" +#include "devnum-util.h" +#include "env-util.h" +#include "escape.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "load-dropin.h" +#include "load-fragment.h" +#include "log.h" +#include "manager.h" +#include "missing_audit.h" +#include "open-file.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "serialize.h" +#include "service.h" +#include "signal-util.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" +#include "utf8.h" + +#define service_spawn(...) service_spawn_internal(__func__, __VA_ARGS__) + +static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = { + [SERVICE_DEAD] = UNIT_INACTIVE, + [SERVICE_CONDITION] = UNIT_ACTIVATING, + [SERVICE_START_PRE] = UNIT_ACTIVATING, + [SERVICE_START] = UNIT_ACTIVATING, + [SERVICE_START_POST] = UNIT_ACTIVATING, + [SERVICE_RUNNING] = UNIT_ACTIVE, + [SERVICE_EXITED] = UNIT_ACTIVE, + [SERVICE_RELOAD] = UNIT_RELOADING, + [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING, + [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING, + [SERVICE_STOP] = UNIT_DEACTIVATING, + [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_STOP_POST] = UNIT_DEACTIVATING, + [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_FAILED] = UNIT_FAILED, + [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE, + [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED, + [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE, + [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, + [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING, + [SERVICE_CLEANING] = UNIT_MAINTENANCE, +}; + +/* For Type=idle we never want to delay any other jobs, hence we + * consider idle jobs active as soon as we start working on them */ +static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = { + [SERVICE_DEAD] = UNIT_INACTIVE, + [SERVICE_CONDITION] = UNIT_ACTIVE, + [SERVICE_START_PRE] = UNIT_ACTIVE, + [SERVICE_START] = UNIT_ACTIVE, + [SERVICE_START_POST] = UNIT_ACTIVE, + [SERVICE_RUNNING] = UNIT_ACTIVE, + [SERVICE_EXITED] = UNIT_ACTIVE, + [SERVICE_RELOAD] = UNIT_RELOADING, + [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING, + [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING, + [SERVICE_STOP] = UNIT_DEACTIVATING, + [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_STOP_POST] = UNIT_DEACTIVATING, + [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_FAILED] = UNIT_FAILED, + [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE, + [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED, + [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE, + [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, + [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING, + [SERVICE_CLEANING] = UNIT_MAINTENANCE, +}; + +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata); +static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata); +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata); + +static void service_enter_signal(Service *s, ServiceState state, ServiceResult f); +static void service_enter_reload_by_notify(Service *s); + +static void service_init(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + s->timeout_start_usec = u->manager->defaults.timeout_start_usec; + s->timeout_stop_usec = u->manager->defaults.timeout_stop_usec; + s->timeout_abort_usec = u->manager->defaults.timeout_abort_usec; + s->timeout_abort_set = u->manager->defaults.timeout_abort_set; + s->restart_usec = u->manager->defaults.restart_usec; + s->restart_max_delay_usec = USEC_INFINITY; + s->runtime_max_usec = USEC_INFINITY; + s->type = _SERVICE_TYPE_INVALID; + s->socket_fd = -EBADF; + s->stdin_fd = s->stdout_fd = s->stderr_fd = -EBADF; + s->guess_main_pid = true; + s->main_pid = PIDREF_NULL; + s->control_pid = PIDREF_NULL; + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + + s->exec_context.keyring_mode = MANAGER_IS_SYSTEM(u->manager) ? + EXEC_KEYRING_PRIVATE : EXEC_KEYRING_INHERIT; + + s->notify_access_override = _NOTIFY_ACCESS_INVALID; + + s->watchdog_original_usec = USEC_INFINITY; + + s->oom_policy = _OOM_POLICY_INVALID; + s->reload_begin_usec = USEC_INFINITY; + s->reload_signal = SIGHUP; + + s->fd_store_preserve_mode = EXEC_PRESERVE_RESTART; +} + +static void service_unwatch_control_pid(Service *s) { + assert(s); + + if (!pidref_is_set(&s->control_pid)) + return; + + unit_unwatch_pidref(UNIT(s), &s->control_pid); + pidref_done(&s->control_pid); +} + +static void service_unwatch_main_pid(Service *s) { + assert(s); + + if (!pidref_is_set(&s->main_pid)) + return; + + unit_unwatch_pidref(UNIT(s), &s->main_pid); + pidref_done(&s->main_pid); +} + +static void service_unwatch_pid_file(Service *s) { + if (!s->pid_file_pathspec) + return; + + log_unit_debug(UNIT(s), "Stopping watch for PID file %s", s->pid_file_pathspec->path); + path_spec_unwatch(s->pid_file_pathspec); + path_spec_done(s->pid_file_pathspec); + s->pid_file_pathspec = mfree(s->pid_file_pathspec); +} + +static int service_set_main_pidref(Service *s, PidRef *pidref) { + int r; + + assert(s); + + /* Takes ownership of the specified pidref on success, but not on failure. */ + + if (!pidref_is_set(pidref)) + return -ESRCH; + + if (pidref->pid <= 1) + return -EINVAL; + + if (pidref_is_self(pidref)) + return -EINVAL; + + if (pidref_equal(&s->main_pid, pidref) && s->main_pid_known) { + pidref_done(pidref); + return 0; + } + + if (!pidref_equal(&s->main_pid, pidref)) { + service_unwatch_main_pid(s); + exec_status_start(&s->main_exec_status, pidref->pid); + } + + s->main_pid = TAKE_PIDREF(*pidref); + s->main_pid_known = true; + + r = pidref_is_my_child(&s->main_pid); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Can't determine if process "PID_FMT" is our child, assuming it is not: %m", s->main_pid.pid); + else if (r == 0) + log_unit_warning(UNIT(s), "Supervising process "PID_FMT" which is not our child. We'll most likely not notice when it exits.", s->main_pid.pid); + + s->main_pid_alien = r <= 0; + return 0; +} + +void service_release_socket_fd(Service *s) { + assert(s); + + if (s->socket_fd < 0 && !UNIT_ISSET(s->accept_socket) && !s->socket_peer) + return; + + log_unit_debug(UNIT(s), "Closing connection socket."); + + /* Undo the effect of service_set_socket_fd(). */ + + s->socket_fd = asynchronous_close(s->socket_fd); + + if (UNIT_ISSET(s->accept_socket)) { + socket_connection_unref(SOCKET(UNIT_DEREF(s->accept_socket))); + unit_ref_unset(&s->accept_socket); + } + + s->socket_peer = socket_peer_unref(s->socket_peer); +} + +static void service_override_notify_access(Service *s, NotifyAccess notify_access_override) { + assert(s); + + s->notify_access_override = notify_access_override; + + log_unit_debug(UNIT(s), "notify_access=%s", notify_access_to_string(s->notify_access)); + log_unit_debug(UNIT(s), "notify_access_override=%s", notify_access_to_string(s->notify_access_override)); +} + +static void service_stop_watchdog(Service *s) { + assert(s); + + s->watchdog_event_source = sd_event_source_disable_unref(s->watchdog_event_source); + s->watchdog_timestamp = DUAL_TIMESTAMP_NULL; +} + +static void service_start_watchdog(Service *s) { + usec_t watchdog_usec; + int r; + + assert(s); + + watchdog_usec = service_get_watchdog_usec(s); + if (!timestamp_is_set(watchdog_usec)) { + service_stop_watchdog(s); + return; + } + + if (s->watchdog_event_source) { + r = sd_event_source_set_time(s->watchdog_event_source, usec_add(s->watchdog_timestamp.monotonic, watchdog_usec)); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to reset watchdog timer: %m"); + return; + } + + r = sd_event_source_set_enabled(s->watchdog_event_source, SD_EVENT_ONESHOT); + } else { + r = sd_event_add_time( + UNIT(s)->manager->event, + &s->watchdog_event_source, + CLOCK_MONOTONIC, + usec_add(s->watchdog_timestamp.monotonic, watchdog_usec), 0, + service_dispatch_watchdog, s); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to add watchdog timer: %m"); + return; + } + + (void) sd_event_source_set_description(s->watchdog_event_source, "service-watchdog"); + + /* Let's process everything else which might be a sign + * of living before we consider a service died. */ + r = sd_event_source_set_priority(s->watchdog_event_source, SD_EVENT_PRIORITY_IDLE); + } + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to install watchdog timer: %m"); +} + +usec_t service_restart_usec_next(Service *s) { + unsigned n_restarts_next; + + assert(s); + + /* When the service state is in SERVICE_*_BEFORE_AUTO_RESTART or SERVICE_AUTO_RESTART, we still need + * to add 1 to s->n_restarts manually, because s->n_restarts is not updated until a restart job is + * enqueued, i.e. state has transitioned to SERVICE_AUTO_RESTART_QUEUED. */ + n_restarts_next = s->n_restarts + (s->state == SERVICE_AUTO_RESTART_QUEUED ? 0 : 1); + + if (n_restarts_next <= 1 || + s->restart_steps == 0 || + s->restart_usec == 0 || + s->restart_max_delay_usec == USEC_INFINITY || + s->restart_usec >= s->restart_max_delay_usec) + return s->restart_usec; + + if (n_restarts_next > s->restart_steps) + return s->restart_max_delay_usec; + + /* Enforced in service_verify() and above */ + assert(s->restart_max_delay_usec > s->restart_usec); + + /* r_i / r_0 = (r_n / r_0) ^ (i / n) + * where, + * r_0 : initial restart usec (s->restart_usec), + * r_i : i-th restart usec (value), + * r_n : maximum restart usec (s->restart_max_delay_usec), + * i : index of the next step (n_restarts_next - 1) + * n : num maximum steps (s->restart_steps) */ + return (usec_t) (s->restart_usec * powl((long double) s->restart_max_delay_usec / s->restart_usec, + (long double) (n_restarts_next - 1) / s->restart_steps)); +} + +static void service_extend_event_source_timeout(Service *s, sd_event_source *source, usec_t extended) { + usec_t current; + int r; + + assert(s); + + /* Extends the specified event source timer to at least the specified time, unless it is already later + * anyway. */ + + if (!source) + return; + + r = sd_event_source_get_time(source, ¤t); + if (r < 0) { + const char *desc; + (void) sd_event_source_get_description(s->timer_event_source, &desc); + log_unit_warning_errno(UNIT(s), r, "Failed to retrieve timeout time for event source '%s', ignoring: %m", strna(desc)); + return; + } + + if (current >= extended) /* Current timeout is already longer, ignore this. */ + return; + + r = sd_event_source_set_time(source, extended); + if (r < 0) { + const char *desc; + (void) sd_event_source_get_description(s->timer_event_source, &desc); + log_unit_warning_errno(UNIT(s), r, "Failed to set timeout time for event source '%s', ignoring %m", strna(desc)); + } +} + +static void service_extend_timeout(Service *s, usec_t extend_timeout_usec) { + usec_t extended; + + assert(s); + + if (!timestamp_is_set(extend_timeout_usec)) + return; + + extended = usec_add(now(CLOCK_MONOTONIC), extend_timeout_usec); + + service_extend_event_source_timeout(s, s->timer_event_source, extended); + service_extend_event_source_timeout(s, s->watchdog_event_source, extended); +} + +static void service_reset_watchdog(Service *s) { + assert(s); + + dual_timestamp_now(&s->watchdog_timestamp); + service_start_watchdog(s); +} + +static void service_override_watchdog_timeout(Service *s, usec_t watchdog_override_usec) { + assert(s); + + s->watchdog_override_enable = true; + s->watchdog_override_usec = watchdog_override_usec; + service_reset_watchdog(s); + + log_unit_debug(UNIT(s), "watchdog_usec="USEC_FMT, s->watchdog_usec); + log_unit_debug(UNIT(s), "watchdog_override_usec="USEC_FMT, s->watchdog_override_usec); +} + +static ServiceFDStore* service_fd_store_unlink(ServiceFDStore *fs) { + if (!fs) + return NULL; + + if (fs->service) { + assert(fs->service->n_fd_store > 0); + LIST_REMOVE(fd_store, fs->service->fd_store, fs); + fs->service->n_fd_store--; + } + + sd_event_source_disable_unref(fs->event_source); + + free(fs->fdname); + asynchronous_close(fs->fd); + return mfree(fs); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(ServiceFDStore*, service_fd_store_unlink); + +static void service_release_fd_store(Service *s) { + assert(s); + + if (!s->fd_store) + return; + + log_unit_debug(UNIT(s), "Releasing all stored fds"); + + while (s->fd_store) + service_fd_store_unlink(s->fd_store); + + assert(s->n_fd_store == 0); +} + +static void service_release_stdio_fd(Service *s) { + assert(s); + + if (s->stdin_fd < 0 && s->stdout_fd < 0 && s->stdout_fd < 0) + return; + + log_unit_debug(UNIT(s), "Releasing stdin/stdout/stderr file descriptors."); + + s->stdin_fd = asynchronous_close(s->stdin_fd); + s->stdout_fd = asynchronous_close(s->stdout_fd); + s->stderr_fd = asynchronous_close(s->stderr_fd); +} +static void service_done(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + open_file_free_many(&s->open_files); + + s->pid_file = mfree(s->pid_file); + s->status_text = mfree(s->status_text); + + s->exec_runtime = exec_runtime_free(s->exec_runtime); + exec_command_free_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX); + s->control_command = NULL; + s->main_command = NULL; + + exit_status_set_free(&s->restart_prevent_status); + exit_status_set_free(&s->restart_force_status); + exit_status_set_free(&s->success_status); + + /* This will leak a process, but at least no memory or any of our resources */ + service_unwatch_main_pid(s); + service_unwatch_control_pid(s); + service_unwatch_pid_file(s); + + if (s->bus_name) { + unit_unwatch_bus_name(u, s->bus_name); + s->bus_name = mfree(s->bus_name); + } + + s->bus_name_owner = mfree(s->bus_name_owner); + + s->usb_function_descriptors = mfree(s->usb_function_descriptors); + s->usb_function_strings = mfree(s->usb_function_strings); + + service_stop_watchdog(s); + + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source); + + s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot); + + service_release_socket_fd(s); + service_release_stdio_fd(s); + service_release_fd_store(s); +} + +static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *userdata) { + ServiceFDStore *fs = ASSERT_PTR(userdata); + + assert(e); + + /* If we get either EPOLLHUP or EPOLLERR, it's time to remove this entry from the fd store */ + log_unit_debug(UNIT(fs->service), + "Received %s on stored fd %d (%s), closing.", + revents & EPOLLERR ? "EPOLLERR" : "EPOLLHUP", + fs->fd, strna(fs->fdname)); + service_fd_store_unlink(fs); + return 0; +} + +static int service_add_fd_store(Service *s, int fd_in, const char *name, bool do_poll) { + _cleanup_(service_fd_store_unlinkp) ServiceFDStore *fs = NULL; + _cleanup_(asynchronous_closep) int fd = ASSERT_FD(fd_in); + struct stat st; + int r; + + /* fd is always consumed even if the function fails. */ + + assert(s); + + if (fstat(fd, &st) < 0) + return -errno; + + log_unit_debug(UNIT(s), "Trying to stash fd for dev=" DEVNUM_FORMAT_STR "/inode=%" PRIu64, DEVNUM_FORMAT_VAL(st.st_dev), (uint64_t) st.st_ino); + + if (s->n_fd_store >= s->n_fd_store_max) + /* Our store is full. Use this errno rather than E[NM]FILE to distinguish from the case + * where systemd itself hits the file limit. */ + return log_unit_debug_errno(UNIT(s), SYNTHETIC_ERRNO(EXFULL), "Hit fd store limit."); + + LIST_FOREACH(fd_store, i, s->fd_store) { + r = same_fd(i->fd, fd); + if (r < 0) + return r; + if (r > 0) { + log_unit_debug(UNIT(s), "Suppressing duplicate fd %i in fd store.", fd); + return 0; /* fd already included */ + } + } + + fs = new(ServiceFDStore, 1); + if (!fs) + return -ENOMEM; + + *fs = (ServiceFDStore) { + .fd = TAKE_FD(fd), + .do_poll = do_poll, + .fdname = strdup(name ?: "stored"), + }; + + if (!fs->fdname) + return -ENOMEM; + + if (do_poll) { + r = sd_event_add_io(UNIT(s)->manager->event, &fs->event_source, fs->fd, 0, on_fd_store_io, fs); + if (r < 0 && r != -EPERM) /* EPERM indicates fds that aren't pollable, which is OK */ + return r; + else if (r >= 0) + (void) sd_event_source_set_description(fs->event_source, "service-fd-store"); + } + + fs->service = s; + LIST_PREPEND(fd_store, s->fd_store, fs); + s->n_fd_store++; + + log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fs->fd, fs->fdname); + + TAKE_PTR(fs); + return 1; /* fd newly stored */ +} + +static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name, bool do_poll) { + int r; + + assert(s); + + for (;;) { + int fd; + + fd = fdset_steal_first(fds); + if (fd < 0) + break; + + r = service_add_fd_store(s, fd, name, do_poll); + if (r == -EXFULL) + return log_unit_warning_errno(UNIT(s), r, + "Cannot store more fds than FileDescriptorStoreMax=%u, closing remaining.", + s->n_fd_store_max); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to add fd to store: %m"); + } + + return 0; +} + +static void service_remove_fd_store(Service *s, const char *name) { + assert(s); + assert(name); + + LIST_FOREACH(fd_store, fs, s->fd_store) { + if (!streq(fs->fdname, name)) + continue; + + log_unit_debug(UNIT(s), "Got explicit request to remove fd %i (%s), closing.", fs->fd, name); + service_fd_store_unlink(fs); + } +} + +static usec_t service_running_timeout(Service *s) { + usec_t delta = 0; + + assert(s); + + if (s->runtime_rand_extra_usec != 0) { + delta = random_u64_range(s->runtime_rand_extra_usec); + log_unit_debug(UNIT(s), "Adding delta of %s sec to timeout", FORMAT_TIMESPAN(delta, USEC_PER_SEC)); + } + + return usec_add(usec_add(UNIT(s)->active_enter_timestamp.monotonic, + s->runtime_max_usec), + delta); +} + +static int service_arm_timer(Service *s, bool relative, usec_t usec) { + assert(s); + + return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, service_dispatch_timer); +} + +static int service_verify(Service *s) { + assert(s); + assert(UNIT(s)->load_state == UNIT_LOADED); + + for (ServiceExecCommand c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++) + LIST_FOREACH(command, command, s->exec_command[c]) { + if (!path_is_absolute(command->path) && !filename_is_valid(command->path)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), + "Service %s= binary path \"%s\" is neither a valid executable name nor an absolute path. Refusing.", + command->path, + service_exec_command_to_string(c)); + if (strv_isempty(command->argv)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), + "Service has an empty argv in %s=. Refusing.", + service_exec_command_to_string(c)); + } + + if (!s->exec_command[SERVICE_EXEC_START] && !s->exec_command[SERVICE_EXEC_STOP] && + UNIT(s)->success_action == EMERGENCY_ACTION_NONE) + /* FailureAction= only makes sense if one of the start or stop commands is specified. + * SuccessAction= will be executed unconditionally if no commands are specified. Hence, + * either a command or SuccessAction= are required. */ + + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart=, ExecStop=, or SuccessAction=. Refusing."); + + if (s->type != SERVICE_ONESHOT && !s->exec_command[SERVICE_EXEC_START]) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart= setting, which is only allowed for Type=oneshot services. Refusing."); + + if (!s->remain_after_exit && !s->exec_command[SERVICE_EXEC_START] && UNIT(s)->success_action == EMERGENCY_ACTION_NONE) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart= and no SuccessAction= settings and does not have RemainAfterExit=yes set. Refusing."); + + if (s->type != SERVICE_ONESHOT && s->exec_command[SERVICE_EXEC_START]->command_next) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has more than one ExecStart= setting, which is only allowed for Type=oneshot services. Refusing."); + + if (s->type == SERVICE_ONESHOT && IN_SET(s->restart, SERVICE_RESTART_ALWAYS, SERVICE_RESTART_ON_SUCCESS)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has Restart= set to either always or on-success, which isn't allowed for Type=oneshot services. Refusing."); + + if (s->type == SERVICE_ONESHOT && !exit_status_set_is_empty(&s->restart_force_status)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has RestartForceExitStatus= set, which isn't allowed for Type=oneshot services. Refusing."); + + if (s->type == SERVICE_ONESHOT && s->exit_type == SERVICE_EXIT_CGROUP) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has ExitType=cgroup set, which isn't allowed for Type=oneshot services. Refusing."); + + if (s->type == SERVICE_DBUS && !s->bus_name) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing."); + + if (s->exec_context.pam_name && !IN_SET(s->kill_context.kill_mode, KILL_CONTROL_GROUP, KILL_MIXED)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has PAM enabled. Kill mode must be set to 'control-group' or 'mixed'. Refusing."); + + if (s->usb_function_descriptors && !s->usb_function_strings) + log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring."); + + if (!s->usb_function_descriptors && s->usb_function_strings) + log_unit_warning(UNIT(s), "Service has USBFunctionStrings= setting, but no USBFunctionDescriptors=. Ignoring."); + + if (s->runtime_max_usec != USEC_INFINITY && s->type == SERVICE_ONESHOT) + log_unit_warning(UNIT(s), "RuntimeMaxSec= has no effect in combination with Type=oneshot. Ignoring."); + + if (s->runtime_max_usec == USEC_INFINITY && s->runtime_rand_extra_usec != 0) + log_unit_warning(UNIT(s), "Service has RuntimeRandomizedExtraSec= setting, but no RuntimeMaxSec=. Ignoring."); + + if (s->exit_type == SERVICE_EXIT_CGROUP && cg_unified() < CGROUP_UNIFIED_SYSTEMD) + log_unit_warning(UNIT(s), "Service has ExitType=cgroup set, but we are running with legacy cgroups v1, which might not work correctly. Continuing."); + + if (s->restart_max_delay_usec == USEC_INFINITY && s->restart_steps > 0) + log_unit_warning(UNIT(s), "Service has RestartSteps= but no RestartMaxDelaySec= setting. Ignoring."); + + if (s->restart_max_delay_usec != USEC_INFINITY && s->restart_steps == 0) + log_unit_warning(UNIT(s), "Service has RestartMaxDelaySec= but no RestartSteps= setting. Ignoring."); + + if (s->restart_max_delay_usec < s->restart_usec) { + log_unit_warning(UNIT(s), "RestartMaxDelaySec= has a value smaller than RestartSec=, resetting RestartSec= to RestartMaxDelaySec=."); + s->restart_usec = s->restart_max_delay_usec; + } + + return 0; +} + +static int service_add_default_dependencies(Service *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + /* Add a number of automatic dependencies useful for the + * majority of services. */ + + if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) { + /* First, pull in the really early boot stuff, and + * require it, so that we fail if we can't acquire + * it. */ + + r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } else { + + /* In the --user instance there's no sysinit.target, + * in that case require basic.target instead. */ + + r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } + + /* Second, if the rest of the base system is in the same + * transaction, order us after it, but do not pull it in or + * even require it. */ + r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + /* Third, add us in for normal shutdown. */ + return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static void service_fix_stdio(Service *s) { + assert(s); + + /* Note that EXEC_INPUT_NULL and EXEC_OUTPUT_INHERIT play a special role here: they are both the + * default value that is subject to automatic overriding triggered by other settings and an explicit + * choice the user can make. We don't distinguish between these cases currently. */ + + if (s->exec_context.std_input == EXEC_INPUT_NULL && + s->exec_context.stdin_data_size > 0) + s->exec_context.std_input = EXEC_INPUT_DATA; + + if (IN_SET(s->exec_context.std_input, + EXEC_INPUT_TTY, + EXEC_INPUT_TTY_FORCE, + EXEC_INPUT_TTY_FAIL, + EXEC_INPUT_SOCKET, + EXEC_INPUT_NAMED_FD)) + return; + + /* We assume these listed inputs refer to bidirectional streams, and hence duplicating them from + * stdin to stdout/stderr makes sense and hence leaving EXEC_OUTPUT_INHERIT in place makes sense, + * too. Outputs such as regular files or sealed data memfds otoh don't really make sense to be + * duplicated for both input and output at the same time (since they then would cause a feedback + * loop), hence override EXEC_OUTPUT_INHERIT with the default stderr/stdout setting. */ + + if (s->exec_context.std_error == EXEC_OUTPUT_INHERIT && + s->exec_context.std_output == EXEC_OUTPUT_INHERIT) + s->exec_context.std_error = UNIT(s)->manager->defaults.std_error; + + if (s->exec_context.std_output == EXEC_OUTPUT_INHERIT) + s->exec_context.std_output = UNIT(s)->manager->defaults.std_output; +} + +static int service_setup_bus_name(Service *s) { + int r; + + assert(s); + + /* If s->bus_name is not set, then the unit will be refused by service_verify() later. */ + if (!s->bus_name) + return 0; + + if (s->type == SERVICE_DBUS) { + r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m"); + + /* We always want to be ordered against dbus.socket if both are in the transaction. */ + r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m"); + } + + r = unit_watch_bus_name(UNIT(s), s->bus_name); + if (r == -EEXIST) + return log_unit_error_errno(UNIT(s), r, "Two services allocated for the same bus name %s, refusing operation.", s->bus_name); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Cannot watch bus name %s: %m", s->bus_name); + + return 0; +} + +static int service_add_extras(Service *s) { + int r; + + assert(s); + + if (s->type == _SERVICE_TYPE_INVALID) { + /* Figure out a type automatically */ + if (s->bus_name) + s->type = SERVICE_DBUS; + else if (s->exec_command[SERVICE_EXEC_START]) + s->type = SERVICE_SIMPLE; + else + s->type = SERVICE_ONESHOT; + } + + /* Oneshot services have disabled start timeout by default */ + if (s->type == SERVICE_ONESHOT && !s->start_timeout_defined) + s->timeout_start_usec = USEC_INFINITY; + + service_fix_stdio(s); + + r = unit_patch_contexts(UNIT(s)); + if (r < 0) + return r; + + r = unit_add_exec_dependencies(UNIT(s), &s->exec_context); + if (r < 0) + return r; + + r = unit_set_default_slice(UNIT(s)); + if (r < 0) + return r; + + /* If the service needs the notify socket, let's enable it automatically. */ + if (s->notify_access == NOTIFY_NONE && + (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD) || s->watchdog_usec > 0 || s->n_fd_store_max > 0)) + s->notify_access = NOTIFY_MAIN; + + /* If no OOM policy was explicitly set, then default to the configure default OOM policy. Except when + * delegation is on, in that case it we assume the payload knows better what to do and can process + * things in a more focused way. */ + if (s->oom_policy < 0) + s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->defaults.oom_policy; + + /* Let the kernel do the killing if that's requested. */ + s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL; + + r = service_add_default_dependencies(s); + if (r < 0) + return r; + + r = service_setup_bus_name(s); + if (r < 0) + return r; + + return 0; +} + +static int service_load(Unit *u) { + Service *s = SERVICE(u); + int r; + + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + /* This is a new unit? Then let's add in some extras */ + r = service_add_extras(s); + if (r < 0) + return r; + + return service_verify(s); +} + +static void service_dump_fdstore(Service *s, FILE *f, const char *prefix) { + assert(s); + assert(f); + assert(prefix); + + LIST_FOREACH(fd_store, i, s->fd_store) { + _cleanup_free_ char *path = NULL; + struct stat st; + int flags; + + if (fstat(i->fd, &st) < 0) { + log_debug_errno(errno, "Failed to stat fdstore entry: %m"); + continue; + } + + flags = fcntl(i->fd, F_GETFL); + if (flags < 0) { + log_debug_errno(errno, "Failed to get fdstore entry flags: %m"); + continue; + } + + (void) fd_get_path(i->fd, &path); + + fprintf(f, + "%s%s '%s' (type=%s; dev=" DEVNUM_FORMAT_STR "; inode=%" PRIu64 "; rdev=" DEVNUM_FORMAT_STR "; path=%s; access=%s)\n", + prefix, i == s->fd_store ? "File Descriptor Store Entry:" : " ", + i->fdname, + inode_type_to_string(st.st_mode), + DEVNUM_FORMAT_VAL(st.st_dev), + (uint64_t) st.st_ino, + DEVNUM_FORMAT_VAL(st.st_rdev), + strna(path), + accmode_to_string(flags)); + } +} + +static void service_dump(Unit *u, FILE *f, const char *prefix) { + Service *s = SERVICE(u); + const char *prefix2; + + assert(s); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + fprintf(f, + "%sService State: %s\n" + "%sResult: %s\n" + "%sReload Result: %s\n" + "%sClean Result: %s\n" + "%sPermissionsStartOnly: %s\n" + "%sRootDirectoryStartOnly: %s\n" + "%sRemainAfterExit: %s\n" + "%sGuessMainPID: %s\n" + "%sType: %s\n" + "%sRestart: %s\n" + "%sNotifyAccess: %s\n" + "%sNotifyState: %s\n" + "%sOOMPolicy: %s\n" + "%sReloadSignal: %s\n", + prefix, service_state_to_string(s->state), + prefix, service_result_to_string(s->result), + prefix, service_result_to_string(s->reload_result), + prefix, service_result_to_string(s->clean_result), + prefix, yes_no(s->permissions_start_only), + prefix, yes_no(s->root_directory_start_only), + prefix, yes_no(s->remain_after_exit), + prefix, yes_no(s->guess_main_pid), + prefix, service_type_to_string(s->type), + prefix, service_restart_to_string(s->restart), + prefix, notify_access_to_string(service_get_notify_access(s)), + prefix, notify_state_to_string(s->notify_state), + prefix, oom_policy_to_string(s->oom_policy), + prefix, signal_to_string(s->reload_signal)); + + if (pidref_is_set(&s->control_pid)) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, s->control_pid.pid); + + if (pidref_is_set(&s->main_pid)) + fprintf(f, + "%sMain PID: "PID_FMT"\n" + "%sMain PID Known: %s\n" + "%sMain PID Alien: %s\n", + prefix, s->main_pid.pid, + prefix, yes_no(s->main_pid_known), + prefix, yes_no(s->main_pid_alien)); + + if (s->pid_file) + fprintf(f, + "%sPIDFile: %s\n", + prefix, s->pid_file); + + if (s->bus_name) + fprintf(f, + "%sBusName: %s\n" + "%sBus Name Good: %s\n", + prefix, s->bus_name, + prefix, yes_no(s->bus_name_good)); + + if (UNIT_ISSET(s->accept_socket)) + fprintf(f, + "%sAccept Socket: %s\n", + prefix, UNIT_DEREF(s->accept_socket)->id); + + fprintf(f, + "%sRestartSec: %s\n" + "%sRestartSteps: %u\n" + "%sRestartMaxDelaySec: %s\n" + "%sTimeoutStartSec: %s\n" + "%sTimeoutStopSec: %s\n" + "%sTimeoutStartFailureMode: %s\n" + "%sTimeoutStopFailureMode: %s\n", + prefix, FORMAT_TIMESPAN(s->restart_usec, USEC_PER_SEC), + prefix, s->restart_steps, + prefix, FORMAT_TIMESPAN(s->restart_max_delay_usec, USEC_PER_SEC), + prefix, FORMAT_TIMESPAN(s->timeout_start_usec, USEC_PER_SEC), + prefix, FORMAT_TIMESPAN(s->timeout_stop_usec, USEC_PER_SEC), + prefix, service_timeout_failure_mode_to_string(s->timeout_start_failure_mode), + prefix, service_timeout_failure_mode_to_string(s->timeout_stop_failure_mode)); + + if (s->timeout_abort_set) + fprintf(f, + "%sTimeoutAbortSec: %s\n", + prefix, FORMAT_TIMESPAN(s->timeout_abort_usec, USEC_PER_SEC)); + + fprintf(f, + "%sRuntimeMaxSec: %s\n" + "%sRuntimeRandomizedExtraSec: %s\n" + "%sWatchdogSec: %s\n", + prefix, FORMAT_TIMESPAN(s->runtime_max_usec, USEC_PER_SEC), + prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC), + prefix, FORMAT_TIMESPAN(s->watchdog_usec, USEC_PER_SEC)); + + kill_context_dump(&s->kill_context, f, prefix); + exec_context_dump(&s->exec_context, f, prefix); + + for (ServiceExecCommand c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++) { + if (!s->exec_command[c]) + continue; + + fprintf(f, "%s-> %s:\n", + prefix, service_exec_command_to_string(c)); + + exec_command_dump_list(s->exec_command[c], f, prefix2); + } + + if (s->status_text) + fprintf(f, "%sStatus Text: %s\n", + prefix, s->status_text); + + if (s->n_fd_store_max > 0) + fprintf(f, + "%sFile Descriptor Store Max: %u\n" + "%sFile Descriptor Store Pin: %s\n" + "%sFile Descriptor Store Current: %zu\n", + prefix, s->n_fd_store_max, + prefix, exec_preserve_mode_to_string(s->fd_store_preserve_mode), + prefix, s->n_fd_store); + + service_dump_fdstore(s, f, prefix); + + if (s->open_files) + LIST_FOREACH(open_files, of, s->open_files) { + _cleanup_free_ char *ofs = NULL; + int r; + + r = open_file_to_string(of, &ofs); + if (r < 0) { + log_debug_errno(r, + "Failed to convert OpenFile= setting to string, ignoring: %m"); + continue; + } + + fprintf(f, "%sOpen File: %s\n", prefix, ofs); + } + + cgroup_context_dump(UNIT(s), f, prefix); +} + +static int service_is_suitable_main_pid(Service *s, PidRef *pid, int prio) { + Unit *owner; + int r; + + assert(s); + assert(pidref_is_set(pid)); + + /* Checks whether the specified PID is suitable as main PID for this service. returns negative if not, 0 if the + * PID is questionnable but should be accepted if the source of configuration is trusted. > 0 if the PID is + * good */ + + if (pidref_is_self(pid) || pid->pid == 1) + return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(EPERM), "New main PID "PID_FMT" is the manager, refusing.", pid->pid); + + if (pidref_equal(pid, &s->control_pid)) + return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(EPERM), "New main PID "PID_FMT" is the control process, refusing.", pid->pid); + + r = pidref_is_alive(pid); + if (r < 0) + return log_unit_full_errno(UNIT(s), prio, r, "Failed to check if main PID "PID_FMT" exists or is a zombie: %m", pid->pid); + if (r == 0) + return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(ESRCH), "New main PID "PID_FMT" does not exist or is a zombie.", pid->pid); + + owner = manager_get_unit_by_pidref(UNIT(s)->manager, pid); + if (owner == UNIT(s)) { + log_unit_debug(UNIT(s), "New main PID "PID_FMT" belongs to service, we are happy.", pid->pid); + return 1; /* Yay, it's definitely a good PID */ + } + + return 0; /* Hmm it's a suspicious PID, let's accept it if configuration source is trusted */ +} + +static int service_load_pid_file(Service *s, bool may_warn) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + bool questionable_pid_file = false; + _cleanup_free_ char *k = NULL; + _cleanup_close_ int fd = -EBADF; + int r, prio; + + assert(s); + + if (!s->pid_file) + return -ENOENT; + + prio = may_warn ? LOG_INFO : LOG_DEBUG; + + r = chase(s->pid_file, NULL, CHASE_SAFE, NULL, &fd); + if (r == -ENOLINK) { + log_unit_debug_errno(UNIT(s), r, + "Potentially unsafe symlink chain, will now retry with relaxed checks: %s", s->pid_file); + + questionable_pid_file = true; + + r = chase(s->pid_file, NULL, 0, NULL, &fd); + } + if (r < 0) + return log_unit_full_errno(UNIT(s), prio, r, + "Can't open PID file %s (yet?) after %s: %m", s->pid_file, service_state_to_string(s->state)); + + /* Let's read the PID file now that we chased it down. But we need to convert the O_PATH fd + * chase() returned us into a proper fd first. */ + r = read_one_line_file(FORMAT_PROC_FD_PATH(fd), &k); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, + "Can't convert PID files %s O_PATH file descriptor to proper file descriptor: %m", + s->pid_file); + + r = pidref_set_pidstr(&pidref, k); + if (r < 0) + return log_unit_full_errno(UNIT(s), prio, r, "Failed to parse PID from file %s: %m", s->pid_file); + + if (s->main_pid_known && pidref_equal(&pidref, &s->main_pid)) + return 0; + + r = service_is_suitable_main_pid(s, &pidref, prio); + if (r < 0) + return r; + if (r == 0) { + struct stat st; + + if (questionable_pid_file) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EPERM), + "Refusing to accept PID outside of service control group, acquired through unsafe symlink chain: %s", s->pid_file); + + /* Hmm, it's not clear if the new main PID is safe. Let's allow this if the PID file is owned by root */ + + if (fstat(fd, &st) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to fstat() PID file O_PATH fd: %m"); + + if (st.st_uid != 0) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EPERM), + "New main PID "PID_FMT" does not belong to service, and PID file is not owned by root. Refusing.", pidref.pid); + + log_unit_debug(UNIT(s), "New main PID "PID_FMT" does not belong to service, but we'll accept it since PID file is owned by root.", pidref.pid); + } + + if (s->main_pid_known) { + log_unit_debug(UNIT(s), "Main PID changing: "PID_FMT" -> "PID_FMT, s->main_pid.pid, pidref.pid); + + service_unwatch_main_pid(s); + s->main_pid_known = false; + } else + log_unit_debug(UNIT(s), "Main PID loaded: "PID_FMT, pidref.pid); + + r = service_set_main_pidref(s, &pidref); + if (r < 0) + return r; + + r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false); + if (r < 0) /* FIXME: we need to do something here */ + return log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" for service: %m", s->main_pid.pid); + + return 1; +} + +static void service_search_main_pid(Service *s) { + _cleanup_(pidref_done) PidRef pid = PIDREF_NULL; + int r; + + assert(s); + + /* If we know it anyway, don't ever fall back to unreliable heuristics */ + if (s->main_pid_known) + return; + + if (!s->guess_main_pid) + return; + + assert(!pidref_is_set(&s->main_pid)); + + if (unit_search_main_pid(UNIT(s), &pid) < 0) + return; + + log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid.pid); + if (service_set_main_pidref(s, &pid) < 0) + return; + + r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false); + if (r < 0) + /* FIXME: we need to do something here */ + log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" from: %m", s->main_pid.pid); +} + +static void service_set_state(Service *s, ServiceState state) { + ServiceState old_state; + const UnitActiveState *table; + + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + table = s->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table; + + old_state = s->state; + s->state = state; + + service_unwatch_pid_file(s); + + if (!IN_SET(state, + SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, + SERVICE_AUTO_RESTART, + SERVICE_CLEANING)) + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + + if (!IN_SET(state, + SERVICE_START, SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) { + service_unwatch_main_pid(s); + s->main_command = NULL; + } + + if (!IN_SET(state, + SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, + SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, + SERVICE_CLEANING)) { + service_unwatch_control_pid(s); + s->control_command = NULL; + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + } + + if (IN_SET(state, + SERVICE_DEAD, SERVICE_FAILED, + SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED, + SERVICE_DEAD_RESOURCES_PINNED)) { + unit_unwatch_all_pids(UNIT(s)); + unit_dequeue_rewatch_pids(UNIT(s)); + } + + if (state != SERVICE_START) + s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source); + + if (!IN_SET(state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY)) + service_stop_watchdog(s); + + /* For the inactive states unit_notify() will trim the cgroup, + * but for exit we have to do that ourselves... */ + if (state == SERVICE_EXITED && !MANAGER_IS_RELOADING(UNIT(s)->manager)) + unit_prune_cgroup(UNIT(s)); + + if (old_state != state) + log_unit_debug(UNIT(s), "Changed %s -> %s", service_state_to_string(old_state), service_state_to_string(state)); + + unit_notify(UNIT(s), table[old_state], table[state], s->reload_result == SERVICE_SUCCESS); +} + +static usec_t service_coldplug_timeout(Service *s) { + assert(s); + + switch (s->deserialized_state) { + + case SERVICE_CONDITION: + case SERVICE_START_PRE: + case SERVICE_START: + case SERVICE_START_POST: + case SERVICE_RELOAD: + case SERVICE_RELOAD_SIGNAL: + case SERVICE_RELOAD_NOTIFY: + return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_start_usec); + + case SERVICE_RUNNING: + return service_running_timeout(s); + + case SERVICE_STOP: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + case SERVICE_STOP_POST: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec); + + case SERVICE_STOP_WATCHDOG: + case SERVICE_FINAL_WATCHDOG: + return usec_add(UNIT(s)->state_change_timestamp.monotonic, service_timeout_abort_usec(s)); + + case SERVICE_AUTO_RESTART: + return usec_add(UNIT(s)->inactive_enter_timestamp.monotonic, service_restart_usec_next(s)); + + case SERVICE_CLEANING: + return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->exec_context.timeout_clean_usec); + + default: + return USEC_INFINITY; + } +} + +static int service_coldplug(Unit *u) { + Service *s = SERVICE(u); + int r; + + assert(s); + assert(s->state == SERVICE_DEAD); + + if (s->deserialized_state == s->state) + return 0; + + r = service_arm_timer(s, /* relative= */ false, service_coldplug_timeout(s)); + if (r < 0) + return r; + + if (pidref_is_set(&s->main_pid) && + pidref_is_unwaited(&s->main_pid) > 0 && + (IN_SET(s->deserialized_state, + SERVICE_START, SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) { + r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false); + if (r < 0) + return r; + } + + if (pidref_is_set(&s->control_pid) && + pidref_is_unwaited(&s->control_pid) > 0 && + IN_SET(s->deserialized_state, + SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, + SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, + SERVICE_CLEANING)) { + r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false); + if (r < 0) + return r; + } + + if (!IN_SET(s->deserialized_state, + SERVICE_DEAD, SERVICE_FAILED, + SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED, + SERVICE_CLEANING, + SERVICE_DEAD_RESOURCES_PINNED)) { + (void) unit_enqueue_rewatch_pids(u); + (void) unit_setup_exec_runtime(u); + } + + if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY)) + service_start_watchdog(s); + + if (UNIT_ISSET(s->accept_socket)) { + Socket* socket = SOCKET(UNIT_DEREF(s->accept_socket)); + + if (socket->max_connections_per_source > 0) { + SocketPeer *peer; + + /* Make a best-effort attempt at bumping the connection count */ + if (socket_acquire_peer(socket, s->socket_fd, &peer) > 0) { + socket_peer_unref(s->socket_peer); + s->socket_peer = peer; + } + } + } + + service_set_state(s, s->deserialized_state); + return 0; +} + +static int service_collect_fds( + Service *s, + int **fds, + char ***fd_names, + size_t *n_socket_fds, + size_t *n_storage_fds) { + + _cleanup_strv_free_ char **rfd_names = NULL; + _cleanup_free_ int *rfds = NULL; + size_t rn_socket_fds = 0, rn_storage_fds = 0; + int r; + + assert(s); + assert(fds); + assert(fd_names); + assert(n_socket_fds); + assert(n_storage_fds); + + if (s->socket_fd >= 0) { + + /* Pass the per-connection socket */ + + rfds = newdup(int, &s->socket_fd, 1); + if (!rfds) + return -ENOMEM; + + rfd_names = strv_new("connection"); + if (!rfd_names) + return -ENOMEM; + + rn_socket_fds = 1; + } else { + Unit *u; + + /* Pass all our configured sockets for singleton services */ + + UNIT_FOREACH_DEPENDENCY(u, UNIT(s), UNIT_ATOM_TRIGGERED_BY) { + _cleanup_free_ int *cfds = NULL; + Socket *sock; + int cn_fds; + + if (u->type != UNIT_SOCKET) + continue; + + sock = SOCKET(u); + + cn_fds = socket_collect_fds(sock, &cfds); + if (cn_fds < 0) + return cn_fds; + + if (cn_fds <= 0) + continue; + + if (!rfds) { + rfds = TAKE_PTR(cfds); + rn_socket_fds = cn_fds; + } else { + int *t; + + t = reallocarray(rfds, rn_socket_fds + cn_fds, sizeof(int)); + if (!t) + return -ENOMEM; + + memcpy(t + rn_socket_fds, cfds, cn_fds * sizeof(int)); + + rfds = t; + rn_socket_fds += cn_fds; + } + + r = strv_extend_n(&rfd_names, socket_fdname(sock), cn_fds); + if (r < 0) + return r; + } + } + + if (s->n_fd_store > 0) { + size_t n_fds; + char **nl; + int *t; + + t = reallocarray(rfds, rn_socket_fds + s->n_fd_store, sizeof(int)); + if (!t) + return -ENOMEM; + + rfds = t; + + nl = reallocarray(rfd_names, rn_socket_fds + s->n_fd_store + 1, sizeof(char *)); + if (!nl) + return -ENOMEM; + + rfd_names = nl; + n_fds = rn_socket_fds; + + LIST_FOREACH(fd_store, fs, s->fd_store) { + rfds[n_fds] = fs->fd; + rfd_names[n_fds] = strdup(strempty(fs->fdname)); + if (!rfd_names[n_fds]) + return -ENOMEM; + + rn_storage_fds++; + n_fds++; + } + + rfd_names[n_fds] = NULL; + } + + *fds = TAKE_PTR(rfds); + *fd_names = TAKE_PTR(rfd_names); + *n_socket_fds = rn_socket_fds; + *n_storage_fds = rn_storage_fds; + + return 0; +} + +static int service_allocate_exec_fd_event_source( + Service *s, + int fd, + sd_event_source **ret_event_source) { + + _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL; + int r; + + assert(s); + assert(fd >= 0); + assert(ret_event_source); + + r = sd_event_add_io(UNIT(s)->manager->event, &source, fd, 0, service_dispatch_exec_io, s); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m"); + + /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */ + + r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m"); + + (void) sd_event_source_set_description(source, "service exec_fd"); + + r = sd_event_source_set_io_fd_own(source, true); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m"); + + *ret_event_source = TAKE_PTR(source); + return 0; +} + +static int service_allocate_exec_fd( + Service *s, + sd_event_source **ret_event_source, + int *ret_exec_fd) { + + _cleanup_close_pair_ int p[] = EBADF_PAIR; + int r; + + assert(s); + assert(ret_event_source); + assert(ret_exec_fd); + + if (pipe2(p, O_CLOEXEC|O_NONBLOCK) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m"); + + r = service_allocate_exec_fd_event_source(s, p[0], ret_event_source); + if (r < 0) + return r; + + TAKE_FD(p[0]); + *ret_exec_fd = TAKE_FD(p[1]); + + return 0; +} + +static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) { + assert(s); + + /* Notifications are accepted depending on the process and + * the access setting of the service: + * process: \ access: NONE MAIN EXEC ALL + * main no yes yes yes + * control no no yes yes + * other (forked) no no no yes */ + + if (flags & EXEC_IS_CONTROL) + /* A control process */ + return IN_SET(service_get_notify_access(s), NOTIFY_EXEC, NOTIFY_ALL); + + /* We only spawn main processes and control processes, so any + * process that is not a control process is a main process */ + return service_get_notify_access(s) != NOTIFY_NONE; +} + +static Service *service_get_triggering_service(Service *s) { + Unit *candidate = NULL, *other; + + assert(s); + + /* Return the service which triggered service 's', this means dependency + * types which include the UNIT_ATOM_ON_{FAILURE,SUCCESS}_OF atoms. + * + * N.B. if there are multiple services which could trigger 's' via OnFailure= + * or OnSuccess= then we return NULL. This is since we don't know from which + * one to propagate the exit status. */ + + UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_ON_FAILURE_OF) { + if (candidate) + goto have_other; + candidate = other; + } + + UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_ON_SUCCESS_OF) { + if (candidate) + goto have_other; + candidate = other; + } + + return SERVICE(candidate); + + have_other: + log_unit_warning(UNIT(s), "multiple trigger source candidates for exit status propagation (%s, %s), skipping.", + candidate->id, other->id); + return NULL; +} + +static int service_spawn_internal( + const char *caller, + Service *s, + ExecCommand *c, + usec_t timeout, + ExecFlags flags, + PidRef *ret_pid) { + + _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(flags); + _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL; + _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL; + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + size_t n_env = 0; + pid_t pid; + int r; + + assert(caller); + assert(s); + assert(c); + assert(ret_pid); + + log_unit_debug(UNIT(s), "Will spawn child (%s): %s", caller, c->path); + + r = unit_prepare_exec(UNIT(s)); /* This realizes the cgroup, among other things */ + if (r < 0) + return r; + + assert(!s->exec_fd_event_source); + + if (flags & EXEC_IS_CONTROL) { + /* If this is a control process, mask the permissions/chroot application if this is requested. */ + if (s->permissions_start_only) + exec_params.flags &= ~EXEC_APPLY_SANDBOXING; + if (s->root_directory_start_only) + exec_params.flags &= ~EXEC_APPLY_CHROOT; + } + + if ((flags & EXEC_PASS_FDS) || + s->exec_context.std_input == EXEC_INPUT_SOCKET || + s->exec_context.std_output == EXEC_OUTPUT_SOCKET || + s->exec_context.std_error == EXEC_OUTPUT_SOCKET) { + + r = service_collect_fds(s, + &exec_params.fds, + &exec_params.fd_names, + &exec_params.n_socket_fds, + &exec_params.n_storage_fds); + if (r < 0) + return r; + + exec_params.open_files = s->open_files; + + log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds); + } + + if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { + r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd); + if (r < 0) + return r; + } + + r = service_arm_timer(s, /* relative= */ true, timeout); + if (r < 0) + return r; + + our_env = new0(char*, 13); + if (!our_env) + return -ENOMEM; + + if (service_exec_needs_notify_socket(s, flags)) { + if (asprintf(our_env + n_env++, "NOTIFY_SOCKET=%s", UNIT(s)->manager->notify_socket) < 0) + return -ENOMEM; + + exec_params.notify_socket = UNIT(s)->manager->notify_socket; + + if (s->n_fd_store_max > 0) + if (asprintf(our_env + n_env++, "FDSTORE=%u", s->n_fd_store_max) < 0) + return -ENOMEM; + } + + if (pidref_is_set(&s->main_pid)) + if (asprintf(our_env + n_env++, "MAINPID="PID_FMT, s->main_pid.pid) < 0) + return -ENOMEM; + + if (MANAGER_IS_USER(UNIT(s)->manager)) + if (asprintf(our_env + n_env++, "MANAGERPID="PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + + if (s->pid_file) + if (asprintf(our_env + n_env++, "PIDFILE=%s", s->pid_file) < 0) + return -ENOMEM; + + if (s->socket_fd >= 0) { + union sockaddr_union sa; + socklen_t salen = sizeof(sa); + + /* If this is a per-connection service instance, let's set $REMOTE_ADDR and $REMOTE_PORT to something + * useful. Note that we do this only when we are still connected at this point in time, which we might + * very well not be. Hence we ignore all errors when retrieving peer information (as that might result + * in ENOTCONN), and just use whate we can use. */ + + if (getpeername(s->socket_fd, &sa.sa, &salen) >= 0 && + IN_SET(sa.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) { + _cleanup_free_ char *addr = NULL; + char *t; + unsigned port; + + r = sockaddr_pretty(&sa.sa, salen, true, false, &addr); + if (r < 0) + return r; + + t = strjoin("REMOTE_ADDR=", addr); + if (!t) + return -ENOMEM; + our_env[n_env++] = t; + + r = sockaddr_port(&sa.sa, &port); + if (r < 0) + return r; + + if (asprintf(&t, "REMOTE_PORT=%u", port) < 0) + return -ENOMEM; + our_env[n_env++] = t; + } + } + + Service *env_source = NULL; + const char *monitor_prefix; + if (flags & EXEC_SETENV_RESULT) { + env_source = s; + monitor_prefix = ""; + } else if (flags & EXEC_SETENV_MONITOR_RESULT) { + env_source = service_get_triggering_service(s); + monitor_prefix = "MONITOR_"; + } + + if (env_source) { + if (asprintf(our_env + n_env++, "%sSERVICE_RESULT=%s", monitor_prefix, service_result_to_string(env_source->result)) < 0) + return -ENOMEM; + + if (env_source->main_exec_status.pid > 0 && + dual_timestamp_is_set(&env_source->main_exec_status.exit_timestamp)) { + if (asprintf(our_env + n_env++, "%sEXIT_CODE=%s", monitor_prefix, sigchld_code_to_string(env_source->main_exec_status.code)) < 0) + return -ENOMEM; + + if (env_source->main_exec_status.code == CLD_EXITED) + r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%i", monitor_prefix, env_source->main_exec_status.status); + else + r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%s", monitor_prefix, signal_to_string(env_source->main_exec_status.status)); + + if (r < 0) + return -ENOMEM; + } + + if (env_source != s) { + if (!sd_id128_is_null(UNIT(env_source)->invocation_id)) { + r = asprintf(our_env + n_env++, "%sINVOCATION_ID=" SD_ID128_FORMAT_STR, + monitor_prefix, SD_ID128_FORMAT_VAL(UNIT(env_source)->invocation_id)); + if (r < 0) + return -ENOMEM; + } + + if (asprintf(our_env + n_env++, "%sUNIT=%s", monitor_prefix, UNIT(env_source)->id) < 0) + return -ENOMEM; + } + } + + if (UNIT(s)->activation_details) { + r = activation_details_append_env(UNIT(s)->activation_details, &our_env); + if (r < 0) + return r; + /* The number of env vars added here can vary, rather than keeping the allocation block in + * sync manually, these functions simply use the strv methods to append to it, so we need + * to update n_env when we are done in case of future usage. */ + n_env += r; + } + + r = unit_set_exec_params(UNIT(s), &exec_params); + if (r < 0) + return r; + + final_env = strv_env_merge(exec_params.environment, our_env); + if (!final_env) + return -ENOMEM; + + /* System D-Bus needs nss-systemd disabled, so that we don't deadlock */ + SET_FLAG(exec_params.flags, EXEC_NSS_DYNAMIC_BYPASS, + MANAGER_IS_SYSTEM(UNIT(s)->manager) && unit_has_name(UNIT(s), SPECIAL_DBUS_SERVICE)); + + strv_free_and_replace(exec_params.environment, final_env); + exec_params.watchdog_usec = service_get_watchdog_usec(s); + exec_params.selinux_context_net = s->socket_fd_selinux_context_net; + if (s->type == SERVICE_IDLE) + exec_params.idle_pipe = UNIT(s)->manager->idle_pipe; + exec_params.stdin_fd = s->stdin_fd; + exec_params.stdout_fd = s->stdout_fd; + exec_params.stderr_fd = s->stderr_fd; + + r = exec_spawn(UNIT(s), + c, + &s->exec_context, + &exec_params, + s->exec_runtime, + &s->cgroup_context, + &pid); + if (r < 0) + return r; + + s->exec_fd_event_source = TAKE_PTR(exec_fd_source); + s->exec_fd_hot = false; + + r = pidref_set_pid(&pidref, pid); + if (r < 0) + return r; + + r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true); + if (r < 0) + return r; + + *ret_pid = TAKE_PIDREF(pidref); + return 0; +} + +static int main_pid_good(Service *s) { + assert(s); + + /* Returns 0 if the pid is dead, > 0 if it is good, < 0 if we don't know */ + + /* If we know the pid file, then let's just check if it is still valid */ + if (s->main_pid_known) { + + /* If it's an alien child let's check if it is still alive ... */ + if (s->main_pid_alien && pidref_is_set(&s->main_pid)) + return pidref_is_alive(&s->main_pid); + + /* .. otherwise assume we'll get a SIGCHLD for it, which we really should wait for to collect + * exit status and code */ + return pidref_is_set(&s->main_pid); + } + + /* We don't know the pid */ + return -EAGAIN; +} + +static int control_pid_good(Service *s) { + assert(s); + + /* Returns 0 if the control PID is dead, > 0 if it is good. We never actually return < 0 here, but in order to + * make this function as similar as possible to main_pid_good() and cgroup_good(), we pretend that < 0 also + * means: we can't figure it out. */ + + return pidref_is_set(&s->control_pid); +} + +static int cgroup_good(Service *s) { + int r; + + assert(s); + + /* Returns 0 if the cgroup is empty or doesn't exist, > 0 if it is exists and is populated, < 0 if we can't + * figure it out */ + + if (!UNIT(s)->cgroup_path) + return 0; + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path); + if (r < 0) + return r; + + return r == 0; +} + +static bool service_shall_restart(Service *s, const char **reason) { + assert(s); + + /* Don't restart after manual stops */ + if (s->forbid_restart) { + *reason = "manual stop"; + return false; + } + + /* Never restart if this is configured as special exception */ + if (exit_status_set_test(&s->restart_prevent_status, s->main_exec_status.code, s->main_exec_status.status)) { + *reason = "prevented by exit status"; + return false; + } + + /* Restart if the exit code/status are configured as restart triggers */ + if (exit_status_set_test(&s->restart_force_status, s->main_exec_status.code, s->main_exec_status.status)) { + *reason = "forced by exit status"; + return true; + } + + *reason = "restart setting"; + switch (s->restart) { + + case SERVICE_RESTART_NO: + return false; + + case SERVICE_RESTART_ALWAYS: + return s->result != SERVICE_SKIP_CONDITION; + + case SERVICE_RESTART_ON_SUCCESS: + return s->result == SERVICE_SUCCESS; + + case SERVICE_RESTART_ON_FAILURE: + return !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_SKIP_CONDITION); + + case SERVICE_RESTART_ON_ABNORMAL: + return !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_FAILURE_EXIT_CODE, SERVICE_SKIP_CONDITION); + + case SERVICE_RESTART_ON_WATCHDOG: + return s->result == SERVICE_FAILURE_WATCHDOG; + + case SERVICE_RESTART_ON_ABORT: + return IN_SET(s->result, SERVICE_FAILURE_SIGNAL, SERVICE_FAILURE_CORE_DUMP); + + default: + assert_not_reached(); + } +} + +static bool service_will_restart(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + if (IN_SET(s->state, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED)) + return true; + + return unit_will_restart_default(u); +} + +static ServiceState service_determine_dead_state(Service *s) { + assert(s); + + return s->fd_store && s->fd_store_preserve_mode == EXEC_PRESERVE_YES ? SERVICE_DEAD_RESOURCES_PINNED : SERVICE_DEAD; +} + +static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) { + ServiceState end_state, restart_state; + int r; + + assert(s); + + /* If there's a stop job queued before we enter the DEAD state, we shouldn't act on Restart=, in order to not + * undo what has already been enqueued. */ + if (unit_stop_pending(UNIT(s))) + allow_restart = false; + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + if (s->result == SERVICE_SUCCESS) { + unit_log_success(UNIT(s)); + end_state = service_determine_dead_state(s); + restart_state = SERVICE_DEAD_BEFORE_AUTO_RESTART; + } else if (s->result == SERVICE_SKIP_CONDITION) { + unit_log_skip(UNIT(s), service_result_to_string(s->result)); + end_state = service_determine_dead_state(s); + restart_state = SERVICE_DEAD_BEFORE_AUTO_RESTART; + } else { + unit_log_failure(UNIT(s), service_result_to_string(s->result)); + end_state = SERVICE_FAILED; + restart_state = SERVICE_FAILED_BEFORE_AUTO_RESTART; + } + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop); + + if (!allow_restart) + log_unit_debug(UNIT(s), "Service restart not allowed."); + else { + const char *reason; + + allow_restart = service_shall_restart(s, &reason); + log_unit_debug(UNIT(s), "Service will %srestart (%s)", + allow_restart ? "" : "not ", + reason); + } + + if (allow_restart) { + usec_t restart_usec_next; + + /* We make two state changes here: one that maps to the high-level UNIT_INACTIVE/UNIT_FAILED + * state (i.e. a state indicating deactivation), and then one that that maps to the + * high-level UNIT_STARTING state (i.e. a state indicating activation). We do this so that + * external software can watch the state changes and see all service failures, even if they + * are only transitionary and followed by an automatic restart. We have fine-grained + * low-level states for this though so that software can distinguish the permanent UNIT_INACTIVE + * state from this transitionary UNIT_INACTIVE state by looking at the low-level states. */ + if (s->restart_mode != SERVICE_RESTART_MODE_DIRECT) + service_set_state(s, restart_state); + + restart_usec_next = service_restart_usec_next(s); + + r = service_arm_timer(s, /* relative= */ true, restart_usec_next); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to install restart timer: %m"); + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false); + return; + } + + log_unit_debug(UNIT(s), "Next restart interval calculated as: %s", FORMAT_TIMESPAN(restart_usec_next, 0)); + + service_set_state(s, SERVICE_AUTO_RESTART); + } else { + service_set_state(s, end_state); + + /* If we shan't restart, then flush out the restart counter. But don't do that immediately, so that the + * user can still introspect the counter. Do so on the next start. */ + s->flush_n_restarts = true; + } + + /* The new state is in effect, let's decrease the fd store ref counter again. Let's also re-add us to the GC + * queue, so that the fd store is possibly gc'ed again */ + unit_add_to_gc_queue(UNIT(s)); + + /* The next restart might not be a manual stop, hence reset the flag indicating manual stops */ + s->forbid_restart = false; + + /* Reset NotifyAccess override */ + s->notify_access_override = _NOTIFY_ACCESS_INVALID; + + /* We want fresh tmpdirs and ephemeral snapshots in case the service is started again immediately. */ + s->exec_runtime = exec_runtime_destroy(s->exec_runtime); + + /* Also, remove the runtime directory */ + unit_destroy_runtime_data(UNIT(s), &s->exec_context); + + /* Also get rid of the fd store, if that's configured. */ + if (s->fd_store_preserve_mode == EXEC_PRESERVE_NO) + service_release_fd_store(s); + + /* Get rid of the IPC bits of the user */ + unit_unref_uid_gid(UNIT(s), true); + + /* Try to delete the pid file. At this point it will be + * out-of-date, and some software might be confused by it, so + * let's remove it. */ + if (s->pid_file) + (void) unlink(s->pid_file); + + /* Reset TTY ownership if necessary */ + exec_context_revert_tty(&s->exec_context); +} + +static void service_enter_stop_post(Service *s, ServiceResult f) { + int r; + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + service_unwatch_control_pid(s); + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_STOP_POST; + pidref_done(&s->control_pid); + + r = service_spawn(s, + s->control_command, + s->timeout_stop_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-post' task: %m"); + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES); + return; + } + + service_set_state(s, SERVICE_STOP_POST); + } else + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_SUCCESS); +} + +static int state_to_kill_operation(Service *s, ServiceState state) { + switch (state) { + + case SERVICE_STOP_WATCHDOG: + case SERVICE_FINAL_WATCHDOG: + return KILL_WATCHDOG; + + case SERVICE_STOP_SIGTERM: + if (unit_has_job_type(UNIT(s), JOB_RESTART)) + return KILL_RESTART; + _fallthrough_; + + case SERVICE_FINAL_SIGTERM: + return KILL_TERMINATE; + + case SERVICE_STOP_SIGKILL: + case SERVICE_FINAL_SIGKILL: + return KILL_KILL; + + default: + return _KILL_OPERATION_INVALID; + } +} + +static void service_enter_signal(Service *s, ServiceState state, ServiceResult f) { + int kill_operation, r; + + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + /* Before sending any signal, make sure we track all members of this cgroup */ + (void) unit_watch_all_pids(UNIT(s)); + + /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have + * died now */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + kill_operation = state_to_kill_operation(s, state); + r = unit_kill_context( + UNIT(s), + &s->kill_context, + kill_operation, + &s->main_pid, + &s->control_pid, + s->main_pid_alien); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + goto fail; + } + + if (r > 0) { + r = service_arm_timer(s, /* relative= */ true, + kill_operation == KILL_WATCHDOG ? service_timeout_abort_usec(s) : s->timeout_stop_usec); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m"); + goto fail; + } + + service_set_state(s, state); + } else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM) && s->kill_context.send_sigkill) + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS); + else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL)) + service_enter_stop_post(s, SERVICE_SUCCESS); + else if (IN_SET(state, SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM) && s->kill_context.send_sigkill) + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS); + else + service_enter_dead(s, SERVICE_SUCCESS, /* allow_restart= */ true); + + return; + +fail: + if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL)) + service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES); + else + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true); +} + +static void service_enter_stop_by_notify(Service *s) { + int r; + + assert(s); + + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + r = service_arm_timer(s, /* relative= */ true, s->timeout_stop_usec); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m"); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); + return; + } + + /* The service told us it's stopping, so it's as if we SIGTERM'd it. */ + service_set_state(s, SERVICE_STOP_SIGTERM); +} + +static void service_enter_stop(Service *s, ServiceResult f) { + int r; + + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + service_unwatch_control_pid(s); + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + s->control_command = s->exec_command[SERVICE_EXEC_STOP]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_STOP; + pidref_done(&s->control_pid); + + r = service_spawn(s, + s->control_command, + s->timeout_stop_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop' task: %m"); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); + return; + } + + service_set_state(s, SERVICE_STOP); + } else + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS); +} + +static bool service_good(Service *s) { + int main_pid_ok; + assert(s); + + if (s->type == SERVICE_DBUS && !s->bus_name_good) + return false; + + main_pid_ok = main_pid_good(s); + if (main_pid_ok > 0) /* It's alive */ + return true; + if (main_pid_ok == 0 && s->exit_type == SERVICE_EXIT_MAIN) /* It's dead */ + return false; + + /* OK, we don't know anything about the main PID, maybe + * because there is none. Let's check the control group + * instead. */ + + return cgroup_good(s) != 0; +} + +static void service_enter_running(Service *s, ServiceResult f) { + int r; + + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + service_unwatch_control_pid(s); + + if (s->result != SERVICE_SUCCESS) + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + else if (service_good(s)) { + + /* If there are any queued up sd_notify() notifications, process them now */ + if (s->notify_state == NOTIFY_RELOADING) + service_enter_reload_by_notify(s); + else if (s->notify_state == NOTIFY_STOPPING) + service_enter_stop_by_notify(s); + else { + service_set_state(s, SERVICE_RUNNING); + + r = service_arm_timer(s, /* relative= */ false, service_running_timeout(s)); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m"); + service_enter_running(s, SERVICE_FAILURE_RESOURCES); + return; + } + } + + } else if (s->remain_after_exit) + service_set_state(s, SERVICE_EXITED); + else + service_enter_stop(s, SERVICE_SUCCESS); +} + +static void service_enter_start_post(Service *s) { + int r; + assert(s); + + service_unwatch_control_pid(s); + service_reset_watchdog(s); + + s->control_command = s->exec_command[SERVICE_EXEC_START_POST]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_START_POST; + pidref_done(&s->control_pid); + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-post' task: %m"); + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); + return; + } + + service_set_state(s, SERVICE_START_POST); + } else + service_enter_running(s, SERVICE_SUCCESS); +} + +static void service_kill_control_process(Service *s) { + int r; + + assert(s); + + if (!pidref_is_set(&s->control_pid)) + return; + + r = pidref_kill_and_sigcont(&s->control_pid, SIGKILL); + if (r < 0) { + _cleanup_free_ char *comm = NULL; + + (void) pidref_get_comm(&s->control_pid, &comm); + + log_unit_debug_errno(UNIT(s), r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", + s->control_pid.pid, strna(comm)); + } +} + +static int service_adverse_to_leftover_processes(Service *s) { + assert(s); + + /* KillMode=mixed and control group are used to indicate that all process should be killed off. + * SendSIGKILL= is used for services that require a clean shutdown. These are typically database + * service where a SigKilled process would result in a lengthy recovery and who's shutdown or startup + * time is quite variable (so Timeout settings aren't of use). + * + * Here we take these two factors and refuse to start a service if there are existing processes + * within a control group. Databases, while generally having some protection against multiple + * instances running, lets not stress the rigor of these. Also ExecStartPre= parts of the service + * aren't as rigoriously written to protect aganst against multiple use. */ + + if (unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start) > 0 && + IN_SET(s->kill_context.kill_mode, KILL_MIXED, KILL_CONTROL_GROUP) && + !s->kill_context.send_sigkill) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EBUSY), + "Will not start SendSIGKILL=no service of type KillMode=control-group or mixed while processes exist"); + + return 0; +} + +static void service_enter_start(Service *s) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + ExecCommand *c; + usec_t timeout; + int r; + + assert(s); + + service_unwatch_control_pid(s); + service_unwatch_main_pid(s); + + r = service_adverse_to_leftover_processes(s); + if (r < 0) + goto fail; + + if (s->type == SERVICE_FORKING) { + s->control_command_id = SERVICE_EXEC_START; + c = s->control_command = s->exec_command[SERVICE_EXEC_START]; + + s->main_command = NULL; + } else { + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + s->control_command = NULL; + + c = s->main_command = s->exec_command[SERVICE_EXEC_START]; + } + + if (!c) { + if (s->type != SERVICE_ONESHOT) { + /* There's no command line configured for the main command? Hmm, that is strange. + * This can only happen if the configuration changes at runtime. In this case, + * let's enter a failure state. */ + r = log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENXIO), "There's no 'start' task anymore we could start."); + goto fail; + } + + /* We force a fake state transition here. Otherwise, the unit would go directly from + * SERVICE_DEAD to SERVICE_DEAD without SERVICE_ACTIVATING or SERVICE_ACTIVE + * in between. This way we can later trigger actions that depend on the state + * transition, including SuccessAction=. */ + service_set_state(s, SERVICE_START); + + service_enter_start_post(s); + return; + } + + if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) + /* For simple + idle this is the main process. We don't apply any timeout here, but + * service_enter_running() will later apply the .runtime_max_usec timeout. */ + timeout = USEC_INFINITY; + else + timeout = s->timeout_start_usec; + + r = service_spawn(s, + c, + timeout, + EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_WRITE_CREDENTIALS|EXEC_SETENV_MONITOR_RESULT, + &pidref); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start' task: %m"); + goto fail; + } + + if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) { + /* For simple services we immediately start + * the START_POST binaries. */ + + (void) service_set_main_pidref(s, &pidref); + service_enter_start_post(s); + + } else if (s->type == SERVICE_FORKING) { + + /* For forking services we wait until the start + * process exited. */ + + pidref_done(&s->control_pid); + s->control_pid = TAKE_PIDREF(pidref); + service_set_state(s, SERVICE_START); + + } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC)) { + + /* For oneshot services we wait until the start process exited, too, but it is our main process. */ + + /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the + * bus. 'notify' and 'exec' services are similar. */ + + (void) service_set_main_pidref(s, &pidref); + service_set_state(s, SERVICE_START); + } else + assert_not_reached(); + + return; + +fail: + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); +} + +static void service_enter_start_pre(Service *s) { + int r; + + assert(s); + + service_unwatch_control_pid(s); + + s->control_command = s->exec_command[SERVICE_EXEC_START_PRE]; + if (s->control_command) { + + r = service_adverse_to_leftover_processes(s); + if (r < 0) + goto fail; + + s->control_command_id = SERVICE_EXEC_START_PRE; + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS, + &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-pre' task: %m"); + goto fail; + } + + service_set_state(s, SERVICE_START_PRE); + } else + service_enter_start(s); + + return; + +fail: + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true); +} + +static void service_enter_condition(Service *s) { + int r; + + assert(s); + + service_unwatch_control_pid(s); + + s->control_command = s->exec_command[SERVICE_EXEC_CONDITION]; + if (s->control_command) { + + r = service_adverse_to_leftover_processes(s); + if (r < 0) + goto fail; + + s->control_command_id = SERVICE_EXEC_CONDITION; + pidref_done(&s->control_pid); + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN, + &s->control_pid); + + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'exec-condition' task: %m"); + goto fail; + } + + service_set_state(s, SERVICE_CONDITION); + } else + service_enter_start_pre(s); + + return; + +fail: + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true); +} + +static void service_enter_restart(Service *s) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(s); + + if (unit_has_job_type(UNIT(s), JOB_STOP)) { + /* Don't restart things if we are going down anyway */ + log_unit_info(UNIT(s), "Stop job pending for unit, skipping automatic restart."); + return; + } + + /* Any units that are bound to this service must also be restarted. We use JOB_START for ourselves + * but then set JOB_RESTART_DEPENDENCIES which will enqueue JOB_RESTART for those dependency jobs. */ + r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT(s), JOB_RESTART_DEPENDENCIES, NULL, &error, NULL); + if (r < 0) { + log_unit_warning(UNIT(s), "Failed to schedule restart job: %s", bus_error_message(&error, r)); + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false); + return; + } + + /* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't + * fully stopped, i.e. as long as it remains up or remains in auto-start states. The user can reset + * the counter explicitly however via the usual "systemctl reset-failure" logic. */ + s->n_restarts ++; + s->flush_n_restarts = false; + + s->notify_access_override = _NOTIFY_ACCESS_INVALID; + + log_unit_struct(UNIT(s), LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR, + LOG_UNIT_INVOCATION_ID(UNIT(s)), + LOG_UNIT_MESSAGE(UNIT(s), + "Scheduled restart job, restart counter is at %u.", s->n_restarts), + "N_RESTARTS=%u", s->n_restarts); + + service_set_state(s, SERVICE_AUTO_RESTART_QUEUED); + + /* Notify clients about changed restart counter */ + unit_add_to_dbus_queue(UNIT(s)); +} + +static void service_enter_reload_by_notify(Service *s) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(s); + + r = service_arm_timer(s, /* relative= */ true, s->timeout_start_usec); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m"); + s->reload_result = SERVICE_FAILURE_RESOURCES; + service_enter_running(s, SERVICE_SUCCESS); + return; + } + + service_set_state(s, SERVICE_RELOAD_NOTIFY); + + /* service_enter_reload_by_notify is never called during a reload, thus no loops are possible. */ + r = manager_propagate_reload(UNIT(s)->manager, UNIT(s), JOB_FAIL, &error); + if (r < 0) + log_unit_warning(UNIT(s), "Failed to schedule propagation of reload, ignoring: %s", bus_error_message(&error, r)); +} + +static void service_enter_reload(Service *s) { + bool killed = false; + int r; + + assert(s); + + service_unwatch_control_pid(s); + s->reload_result = SERVICE_SUCCESS; + + usec_t ts = now(CLOCK_MONOTONIC); + + if (s->type == SERVICE_NOTIFY_RELOAD && pidref_is_set(&s->main_pid)) { + r = pidref_kill_and_sigcont(&s->main_pid, s->reload_signal); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to send reload signal: %m"); + goto fail; + } + + killed = true; + } + + s->control_command = s->exec_command[SERVICE_EXEC_RELOAD]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_RELOAD; + pidref_done(&s->control_pid); + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'reload' task: %m"); + goto fail; + } + + service_set_state(s, SERVICE_RELOAD); + } else if (killed) { + r = service_arm_timer(s, /* relative= */ true, s->timeout_start_usec); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m"); + goto fail; + } + + service_set_state(s, SERVICE_RELOAD_SIGNAL); + } else { + service_enter_running(s, SERVICE_SUCCESS); + return; + } + + /* Store the timestamp when we started reloading: when reloading via SIGHUP we won't leave the reload + * state until we received both RELOADING=1 and READY=1 with MONOTONIC_USEC= set to a value above + * this. Thus we know for sure the reload cycle was executed *after* we requested it, and is not one + * that was already in progress before. */ + s->reload_begin_usec = ts; + return; + +fail: + s->reload_result = SERVICE_FAILURE_RESOURCES; + service_enter_running(s, SERVICE_SUCCESS); +} + +static void service_run_next_control(Service *s) { + usec_t timeout; + int r; + + assert(s); + assert(s->control_command); + assert(s->control_command->command_next); + + assert(s->control_command_id != SERVICE_EXEC_START); + + s->control_command = s->control_command->command_next; + service_unwatch_control_pid(s); + + if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) + timeout = s->timeout_start_usec; + else + timeout = s->timeout_stop_usec; + + pidref_done(&s->control_pid); + + r = service_spawn(s, + s->control_command, + timeout, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL| + (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD) ? EXEC_WRITE_CREDENTIALS : 0)| + (IN_SET(s->control_command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_STOP_POST) ? EXEC_APPLY_TTY_STDIN : 0)| + (IN_SET(s->control_command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_SETENV_RESULT : 0)| + (IN_SET(s->control_command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START) ? EXEC_SETENV_MONITOR_RESULT : 0)| + (IN_SET(s->control_command_id, SERVICE_EXEC_START_POST, SERVICE_EXEC_RELOAD, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_CONTROL_CGROUP : 0), + &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn next control task: %m"); + + if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START_POST, SERVICE_STOP)) + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); + else if (s->state == SERVICE_STOP_POST) + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true); + else if (s->state == SERVICE_RELOAD) { + s->reload_result = SERVICE_FAILURE_RESOURCES; + service_enter_running(s, SERVICE_SUCCESS); + } else + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); + } +} + +static void service_run_next_main(Service *s) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + int r; + + assert(s); + assert(s->main_command); + assert(s->main_command->command_next); + assert(s->type == SERVICE_ONESHOT); + + s->main_command = s->main_command->command_next; + service_unwatch_main_pid(s); + + r = service_spawn(s, + s->main_command, + s->timeout_start_usec, + EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS, + &pidref); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn next main task: %m"); + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); + return; + } + + (void) service_set_main_pidref(s, &pidref); +} + +static int service_start(Unit *u) { + Service *s = SERVICE(u); + int r; + + assert(s); + + /* We cannot fulfill this request right now, try again later + * please! */ + if (IN_SET(s->state, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, SERVICE_CLEANING)) + return -EAGAIN; + + /* Already on it! */ + if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST)) + return 0; + + /* A service that will be restarted must be stopped first to trigger BindsTo and/or OnFailure + * dependencies. If a user does not want to wait for the holdoff time to elapse, the service should + * be manually restarted, not started. We simply return EAGAIN here, so that any start jobs stay + * queued, and assume that the auto restart timer will eventually trigger the restart. */ + if (IN_SET(s->state, SERVICE_AUTO_RESTART, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART)) + return -EAGAIN; + + assert(IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED, SERVICE_AUTO_RESTART_QUEUED)); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + s->result = SERVICE_SUCCESS; + s->reload_result = SERVICE_SUCCESS; + s->main_pid_known = false; + s->main_pid_alien = false; + s->forbid_restart = false; + + s->status_text = mfree(s->status_text); + s->status_errno = 0; + + s->notify_access_override = _NOTIFY_ACCESS_INVALID; + s->notify_state = NOTIFY_UNKNOWN; + + s->watchdog_original_usec = s->watchdog_usec; + s->watchdog_override_enable = false; + s->watchdog_override_usec = USEC_INFINITY; + + exec_command_reset_status_list_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX); + exec_status_reset(&s->main_exec_status); + + /* This is not an automatic restart? Flush the restart counter then */ + if (s->flush_n_restarts) { + s->n_restarts = 0; + s->flush_n_restarts = false; + } + + u->reset_accounting = true; + + service_enter_condition(s); + return 1; +} + +static int service_stop(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + /* Don't create restart jobs from manual stops. */ + s->forbid_restart = true; + + switch (s->state) { + + case SERVICE_STOP: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + case SERVICE_STOP_POST: + case SERVICE_FINAL_WATCHDOG: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + /* Already on it */ + return 0; + + case SERVICE_AUTO_RESTART: + case SERVICE_AUTO_RESTART_QUEUED: + /* Give up on the auto restart */ + service_set_state(s, service_determine_dead_state(s)); + return 0; + + case SERVICE_CONDITION: + case SERVICE_START_PRE: + case SERVICE_START: + case SERVICE_START_POST: + case SERVICE_RELOAD: + case SERVICE_RELOAD_SIGNAL: + case SERVICE_RELOAD_NOTIFY: + case SERVICE_STOP_WATCHDOG: + /* If there's already something running we go directly into kill mode. */ + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS); + return 0; + + case SERVICE_CLEANING: + /* If we are currently cleaning, then abort it, brutally. */ + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS); + return 0; + + case SERVICE_RUNNING: + case SERVICE_EXITED: + service_enter_stop(s, SERVICE_SUCCESS); + return 1; + + case SERVICE_DEAD_BEFORE_AUTO_RESTART: + case SERVICE_FAILED_BEFORE_AUTO_RESTART: + case SERVICE_DEAD: + case SERVICE_FAILED: + case SERVICE_DEAD_RESOURCES_PINNED: + default: + /* Unknown state, or unit_stop() should already have handled these */ + assert_not_reached(); + } +} + +static int service_reload(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED)); + + service_enter_reload(s); + return 1; +} + +static bool service_can_reload(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + return s->exec_command[SERVICE_EXEC_RELOAD] || + s->type == SERVICE_NOTIFY_RELOAD; +} + +static unsigned service_exec_command_index(Unit *u, ServiceExecCommand id, const ExecCommand *current) { + Service *s = SERVICE(u); + unsigned idx = 0; + + assert(s); + assert(id >= 0); + assert(id < _SERVICE_EXEC_COMMAND_MAX); + + const ExecCommand *first = s->exec_command[id]; + + /* Figure out where we are in the list by walking back to the beginning */ + for (const ExecCommand *c = current; c != first; c = c->command_prev) + idx++; + + return idx; +} + +static int service_serialize_exec_command(Unit *u, FILE *f, const ExecCommand *command) { + _cleanup_free_ char *args = NULL, *p = NULL; + Service *s = SERVICE(u); + const char *type, *key; + ServiceExecCommand id; + size_t length = 0; + unsigned idx; + + assert(s); + assert(f); + + if (!command) + return 0; + + if (command == s->control_command) { + type = "control"; + id = s->control_command_id; + } else { + type = "main"; + id = SERVICE_EXEC_START; + } + + idx = service_exec_command_index(u, id, command); + + STRV_FOREACH(arg, command->argv) { + _cleanup_free_ char *e = NULL; + size_t n; + + e = cescape(*arg); + if (!e) + return log_oom(); + + n = strlen(e); + if (!GREEDY_REALLOC(args, length + 2 + n + 2)) + return log_oom(); + + if (length > 0) + args[length++] = ' '; + + args[length++] = '"'; + memcpy(args + length, e, n); + length += n; + args[length++] = '"'; + } + + if (!GREEDY_REALLOC(args, length + 1)) + return log_oom(); + + args[length++] = 0; + + p = cescape(command->path); + if (!p) + return log_oom(); + + key = strjoina(type, "-command"); + + /* We use '+1234' instead of '1234' to mark the last command in a sequence. + * This is used in service_deserialize_exec_command(). */ + (void) serialize_item_format( + f, key, + "%s %s%u %s %s", + service_exec_command_to_string(id), + command->command_next ? "" : "+", + idx, + p, args); + + return 0; +} + +static int service_serialize(Unit *u, FILE *f, FDSet *fds) { + Service *s = SERVICE(u); + int r; + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", service_state_to_string(s->state)); + (void) serialize_item(f, "result", service_result_to_string(s->result)); + (void) serialize_item(f, "reload-result", service_result_to_string(s->reload_result)); + + (void) serialize_pidref(f, fds, "control-pid", &s->control_pid); + if (s->main_pid_known) + (void) serialize_pidref(f, fds, "main-pid", &s->main_pid); + + (void) serialize_bool(f, "main-pid-known", s->main_pid_known); + (void) serialize_bool(f, "bus-name-good", s->bus_name_good); + (void) serialize_bool(f, "bus-name-owner", s->bus_name_owner); + + (void) serialize_item_format(f, "n-restarts", "%u", s->n_restarts); + (void) serialize_bool(f, "flush-n-restarts", s->flush_n_restarts); + + r = serialize_item_escaped(f, "status-text", s->status_text); + if (r < 0) + return r; + + service_serialize_exec_command(u, f, s->control_command); + service_serialize_exec_command(u, f, s->main_command); + + r = serialize_fd(f, fds, "stdin-fd", s->stdin_fd); + if (r < 0) + return r; + r = serialize_fd(f, fds, "stdout-fd", s->stdout_fd); + if (r < 0) + return r; + r = serialize_fd(f, fds, "stderr-fd", s->stderr_fd); + if (r < 0) + return r; + + if (s->exec_fd_event_source) { + r = serialize_fd(f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source)); + if (r < 0) + return r; + + (void) serialize_bool(f, "exec-fd-hot", s->exec_fd_hot); + } + + if (UNIT_ISSET(s->accept_socket)) { + r = serialize_item(f, "accept-socket", UNIT_DEREF(s->accept_socket)->id); + if (r < 0) + return r; + } + + r = serialize_fd(f, fds, "socket-fd", s->socket_fd); + if (r < 0) + return r; + + LIST_FOREACH(fd_store, fs, s->fd_store) { + _cleanup_free_ char *c = NULL; + int copy; + + copy = fdset_put_dup(fds, fs->fd); + if (copy < 0) + return log_error_errno(copy, "Failed to copy file descriptor for serialization: %m"); + + c = cescape(fs->fdname); + if (!c) + return log_oom(); + + (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %i", copy, c, fs->do_poll); + } + + if (s->main_exec_status.pid > 0) { + (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid); + (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp); + (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp); + + if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) { + (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code); + (void) serialize_item_format(f, "main-exec-status-status", "%i", s->main_exec_status.status); + } + } + + if (s->notify_access_override >= 0) + (void) serialize_item(f, "notify-access-override", notify_access_to_string(s->notify_access_override)); + + (void) serialize_dual_timestamp(f, "watchdog-timestamp", &s->watchdog_timestamp); + (void) serialize_bool(f, "forbid-restart", s->forbid_restart); + + if (s->watchdog_override_enable) + (void) serialize_item_format(f, "watchdog-override-usec", USEC_FMT, s->watchdog_override_usec); + + if (s->watchdog_original_usec != USEC_INFINITY) + (void) serialize_item_format(f, "watchdog-original-usec", USEC_FMT, s->watchdog_original_usec); + + if (s->reload_begin_usec != USEC_INFINITY) + (void) serialize_item_format(f, "reload-begin-usec", USEC_FMT, s->reload_begin_usec); + + return 0; +} + +int service_deserialize_exec_command( + Unit *u, + const char *key, + const char *value) { + + Service *s = SERVICE(u); + int r; + unsigned idx = 0, i; + bool control, found = false, last = false; + ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID; + ExecCommand *command = NULL; + _cleanup_free_ char *path = NULL; + _cleanup_strv_free_ char **argv = NULL; + + enum ExecCommandState { + STATE_EXEC_COMMAND_TYPE, + STATE_EXEC_COMMAND_INDEX, + STATE_EXEC_COMMAND_PATH, + STATE_EXEC_COMMAND_ARGS, + _STATE_EXEC_COMMAND_MAX, + _STATE_EXEC_COMMAND_INVALID = -EINVAL, + } state; + + assert(s); + assert(key); + assert(value); + + control = streq(key, "control-command"); + + state = STATE_EXEC_COMMAND_TYPE; + + for (;;) { + _cleanup_free_ char *arg = NULL; + + r = extract_first_word(&value, &arg, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE); + if (r < 0) + return r; + if (r == 0) + break; + + switch (state) { + case STATE_EXEC_COMMAND_TYPE: + id = service_exec_command_from_string(arg); + if (id < 0) + return id; + + state = STATE_EXEC_COMMAND_INDEX; + break; + case STATE_EXEC_COMMAND_INDEX: + /* PID 1234 is serialized as either '1234' or '+1234'. The second form is used to + * mark the last command in a sequence. We warn if the deserialized command doesn't + * match what we have loaded from the unit, but we don't need to warn if that is the + * last command. */ + + r = safe_atou(arg, &idx); + if (r < 0) + return r; + last = arg[0] == '+'; + + state = STATE_EXEC_COMMAND_PATH; + break; + case STATE_EXEC_COMMAND_PATH: + path = TAKE_PTR(arg); + state = STATE_EXEC_COMMAND_ARGS; + break; + case STATE_EXEC_COMMAND_ARGS: + r = strv_extend(&argv, arg); + if (r < 0) + return -ENOMEM; + break; + default: + assert_not_reached(); + } + } + + if (state != STATE_EXEC_COMMAND_ARGS) + return -EINVAL; + if (strv_isempty(argv)) + return -EINVAL; /* At least argv[0] must be always present. */ + + /* Let's check whether exec command on given offset matches data that we just deserialized */ + for (command = s->exec_command[id], i = 0; command; command = command->command_next, i++) { + if (i != idx) + continue; + + found = strv_equal(argv, command->argv) && streq(command->path, path); + break; + } + + if (!found) { + /* Command at the index we serialized is different, let's look for command that exactly + * matches but is on different index. If there is no such command we will not resume execution. */ + for (command = s->exec_command[id]; command; command = command->command_next) + if (strv_equal(command->argv, argv) && streq(command->path, path)) + break; + } + + if (command && control) { + s->control_command = command; + s->control_command_id = id; + } else if (command) + s->main_command = command; + else if (last) + log_unit_debug(u, "Current command vanished from the unit file."); + else + log_unit_warning(u, "Current command vanished from the unit file, execution of the command list won't be resumed."); + + return 0; +} + +static int service_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Service *s = SERVICE(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + ServiceState state; + + state = service_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + } else if (streq(key, "result")) { + ServiceResult f; + + f = service_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != SERVICE_SUCCESS) + s->result = f; + + } else if (streq(key, "reload-result")) { + ServiceResult f; + + f = service_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse reload result value: %s", value); + else if (f != SERVICE_SUCCESS) + s->reload_result = f; + + } else if (streq(key, "control-pid")) { + pidref_done(&s->control_pid); + + (void) deserialize_pidref(fds, value, &s->control_pid); + + } else if (streq(key, "main-pid")) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + + if (deserialize_pidref(fds, value, &pidref) >= 0) + (void) service_set_main_pidref(s, &pidref); + + } else if (streq(key, "main-pid-known")) { + int b; + + b = parse_boolean(value); + if (b < 0) + log_unit_debug(u, "Failed to parse main-pid-known value: %s", value); + else + s->main_pid_known = b; + } else if (streq(key, "bus-name-good")) { + int b; + + b = parse_boolean(value); + if (b < 0) + log_unit_debug(u, "Failed to parse bus-name-good value: %s", value); + else + s->bus_name_good = b; + } else if (streq(key, "bus-name-owner")) { + r = free_and_strdup(&s->bus_name_owner, value); + if (r < 0) + log_unit_error_errno(u, r, "Unable to deserialize current bus owner %s: %m", value); + } else if (streq(key, "status-text")) { + char *t; + ssize_t l; + + l = cunescape(value, 0, &t); + if (l < 0) + log_unit_debug_errno(u, l, "Failed to unescape status text '%s': %m", value); + else + free_and_replace(s->status_text, t); + + } else if (streq(key, "accept-socket")) { + Unit *socket; + + if (u->type != UNIT_SOCKET) { + log_unit_debug(u, "Failed to deserialize accept-socket: unit is not a socket"); + return 0; + } + + r = manager_load_unit(u->manager, value, NULL, NULL, &socket); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to load accept-socket unit '%s': %m", value); + else { + unit_ref_set(&s->accept_socket, u, socket); + SOCKET(socket)->n_connections++; + } + + } else if (streq(key, "socket-fd")) { + asynchronous_close(s->socket_fd); + s->socket_fd = deserialize_fd(fds, value); + + } else if (streq(key, "fd-store-fd")) { + _cleanup_free_ char *fdv = NULL, *fdn = NULL, *fdp = NULL; + _cleanup_close_ int fd = -EBADF; + int do_poll; + + r = extract_first_word(&value, &fdv, NULL, 0); + if (r <= 0) { + log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value); + return 0; + } + + fd = deserialize_fd(fds, fdv); + if (fd < 0) + return 0; + + r = extract_first_word(&value, &fdn, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE); + if (r <= 0) { + log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value); + return 0; + } + + r = extract_first_word(&value, &fdp, NULL, 0); + if (r == 0) { + /* If the value is not present, we assume the default */ + do_poll = 1; + } else if (r < 0 || (r = safe_atoi(fdp, &do_poll)) < 0) { + log_unit_debug_errno(u, r, "Failed to parse fd-store-fd value \"%s\", ignoring: %m", value); + return 0; + } + + r = service_add_fd_store(s, fd, fdn, do_poll); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to store deserialized fd %i, ignoring: %m", fd); + return 0; + } + + TAKE_FD(fd); + } else if (streq(key, "main-exec-status-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse main-exec-status-pid value: %s", value); + else + s->main_exec_status.pid = pid; + } else if (streq(key, "main-exec-status-code")) { + int i; + + if (safe_atoi(value, &i) < 0) + log_unit_debug(u, "Failed to parse main-exec-status-code value: %s", value); + else + s->main_exec_status.code = i; + } else if (streq(key, "main-exec-status-status")) { + int i; + + if (safe_atoi(value, &i) < 0) + log_unit_debug(u, "Failed to parse main-exec-status-status value: %s", value); + else + s->main_exec_status.status = i; + } else if (streq(key, "main-exec-status-start")) + deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp); + else if (streq(key, "main-exec-status-exit")) + deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp); + else if (streq(key, "notify-access-override")) { + NotifyAccess notify_access; + + notify_access = notify_access_from_string(value); + if (notify_access < 0) + log_unit_debug(u, "Failed to parse notify-access-override value: %s", value); + else + s->notify_access_override = notify_access; + } else if (streq(key, "watchdog-timestamp")) + deserialize_dual_timestamp(value, &s->watchdog_timestamp); + else if (streq(key, "forbid-restart")) { + int b; + + b = parse_boolean(value); + if (b < 0) + log_unit_debug(u, "Failed to parse forbid-restart value: %s", value); + else + s->forbid_restart = b; + } else if (streq(key, "stdin-fd")) { + + asynchronous_close(s->stdin_fd); + s->stdin_fd = deserialize_fd(fds, value); + if (s->stdin_fd >= 0) + s->exec_context.stdio_as_fds = true; + + } else if (streq(key, "stdout-fd")) { + + asynchronous_close(s->stdout_fd); + s->stdout_fd = deserialize_fd(fds, value); + if (s->stdout_fd >= 0) + s->exec_context.stdio_as_fds = true; + + } else if (streq(key, "stderr-fd")) { + + asynchronous_close(s->stderr_fd); + s->stderr_fd = deserialize_fd(fds, value); + if (s->stderr_fd >= 0) + s->exec_context.stdio_as_fds = true; + + } else if (streq(key, "exec-fd")) { + _cleanup_close_ int fd = -EBADF; + + fd = deserialize_fd(fds, value); + if (fd >= 0) { + s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source); + + if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) >= 0) + TAKE_FD(fd); + } + + } else if (streq(key, "watchdog-override-usec")) { + if (deserialize_usec(value, &s->watchdog_override_usec) < 0) + log_unit_debug(u, "Failed to parse watchdog_override_usec value: %s", value); + else + s->watchdog_override_enable = true; + + } else if (streq(key, "watchdog-original-usec")) { + if (deserialize_usec(value, &s->watchdog_original_usec) < 0) + log_unit_debug(u, "Failed to parse watchdog_original_usec value: %s", value); + + } else if (STR_IN_SET(key, "main-command", "control-command")) { + r = service_deserialize_exec_command(u, key, value); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized command \"%s\": %m", value); + + } else if (streq(key, "n-restarts")) { + r = safe_atou(value, &s->n_restarts); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized restart counter '%s': %m", value); + + } else if (streq(key, "flush-n-restarts")) { + r = parse_boolean(value); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized flush restart counter setting '%s': %m", value); + else + s->flush_n_restarts = r; + } else if (streq(key, "reload-begin-usec")) { + r = deserialize_usec(value, &s->reload_begin_usec); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized reload begin timestamp '%s', ignoring: %m", value); + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static UnitActiveState service_active_state(Unit *u) { + const UnitActiveState *table; + + assert(u); + + table = SERVICE(u)->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table; + + return table[SERVICE(u)->state]; +} + +static const char *service_sub_state_to_string(Unit *u) { + assert(u); + + return service_state_to_string(SERVICE(u)->state); +} + +static bool service_may_gc(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + /* Never clean up services that still have a process around, even if the service is formally dead. Note that + * unit_may_gc() already checked our cgroup for us, we just check our two additional PIDs, too, in case they + * have moved outside of the cgroup. */ + + if (main_pid_good(s) > 0 || + control_pid_good(s) > 0) + return false; + + /* Only allow collection of actually dead services, i.e. not those that are in the transitionary + * SERVICE_DEAD_BEFORE_AUTO_RESTART/SERVICE_FAILED_BEFORE_AUTO_RESTART states. */ + if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED)) + return false; + + return true; +} + +static int service_retry_pid_file(Service *s) { + int r; + + assert(s->pid_file); + assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST)); + + r = service_load_pid_file(s, false); + if (r < 0) + return r; + + service_unwatch_pid_file(s); + + service_enter_running(s, SERVICE_SUCCESS); + return 0; +} + +static int service_watch_pid_file(Service *s) { + int r; + + log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path); + + r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to set a watch for PID file %s: %m", s->pid_file_pathspec->path); + service_unwatch_pid_file(s); + return r; + } + + /* the pidfile might have appeared just before we set the watch */ + log_unit_debug(UNIT(s), "Trying to read PID file %s in case it changed", s->pid_file_pathspec->path); + service_retry_pid_file(s); + + return 0; +} + +static int service_demand_pid_file(Service *s) { + _cleanup_free_ PathSpec *ps = NULL; + + assert(s->pid_file); + assert(!s->pid_file_pathspec); + + ps = new(PathSpec, 1); + if (!ps) + return -ENOMEM; + + *ps = (PathSpec) { + .unit = UNIT(s), + .path = strdup(s->pid_file), + /* PATH_CHANGED would not be enough. There are daemons (sendmail) that keep their PID file + * open all the time. */ + .type = PATH_MODIFIED, + .inotify_fd = -EBADF, + }; + + if (!ps->path) + return -ENOMEM; + + path_simplify(ps->path); + + s->pid_file_pathspec = TAKE_PTR(ps); + + return service_watch_pid_file(s); +} + +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { + PathSpec *p = ASSERT_PTR(userdata); + Service *s; + + s = SERVICE(p->unit); + + assert(s); + assert(fd >= 0); + assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST)); + assert(s->pid_file_pathspec); + assert(path_spec_owns_inotify_fd(s->pid_file_pathspec, fd)); + + log_unit_debug(UNIT(s), "inotify event"); + + if (path_spec_fd_event(p, events) < 0) + goto fail; + + if (service_retry_pid_file(s) == 0) + return 0; + + if (service_watch_pid_file(s) < 0) + goto fail; + + return 0; + +fail: + service_unwatch_pid_file(s); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); + return 0; +} + +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { + Service *s = SERVICE(userdata); + + assert(s); + + log_unit_debug(UNIT(s), "got exec-fd event"); + + /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve() + * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically + * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the + * parent. We need to be careful however, as there are other reasons that we might cause the child's side of + * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless + * the child signalled us first that it is about to call the execve(). It does so by sending us a simple + * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it + * sends a zero byte we'll ignore POLLHUP on the fd again. */ + + for (;;) { + uint8_t x; + ssize_t n; + + n = read(fd, &x, sizeof(x)); + if (n < 0) { + if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ + return 0; + + return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m"); + } + if (n == 0) { /* EOF → the event we are waiting for */ + + s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source); + + if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */ + log_unit_debug(UNIT(s), "Got EOF on exec-fd"); + + s->exec_fd_hot = false; + + /* Nice! This is what we have been waiting for. Transition to next state. */ + if (s->type == SERVICE_EXEC && s->state == SERVICE_START) + service_enter_start_post(s); + } else + log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring."); + + return 0; + } + + /* A byte was read → this turns on/off the exec fd logic */ + assert(n == sizeof(x)); + s->exec_fd_hot = x; + } + + return 0; +} + +static void service_notify_cgroup_empty_event(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + + log_unit_debug(u, "Control group is empty."); + + switch (s->state) { + + /* Waiting for SIGCHLD is usually more interesting, because it includes return + * codes/signals. Which is why we ignore the cgroup events for most cases, except when we + * don't know pid which to expect the SIGCHLD for. */ + + case SERVICE_START: + if (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD) && + main_pid_good(s) == 0 && + control_pid_good(s) == 0) { + /* No chance of getting a ready notification anymore */ + service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL); + break; + } + + if (s->exit_type == SERVICE_EXIT_CGROUP && main_pid_good(s) <= 0) + service_enter_start_post(s); + + _fallthrough_; + case SERVICE_START_POST: + if (s->pid_file_pathspec && + main_pid_good(s) == 0 && + control_pid_good(s) == 0) { + + /* Give up hoping for the daemon to write its PID file */ + log_unit_warning(u, "Daemon never wrote its PID file. Failing."); + + service_unwatch_pid_file(s); + if (s->state == SERVICE_START) + service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL); + else + service_enter_stop(s, SERVICE_FAILURE_PROTOCOL); + } + break; + + case SERVICE_RUNNING: + /* service_enter_running() will figure out what to do */ + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + + if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0) + service_enter_stop_post(s, SERVICE_SUCCESS); + + break; + + case SERVICE_STOP_POST: + case SERVICE_FINAL_WATCHDOG: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0) + service_enter_dead(s, SERVICE_SUCCESS, true); + + break; + + /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean + * up the cgroup earlier and should do it now. */ + case SERVICE_AUTO_RESTART: + case SERVICE_AUTO_RESTART_QUEUED: + unit_prune_cgroup(u); + break; + + default: + ; + } +} + +static void service_notify_cgroup_oom_event(Unit *u, bool managed_oom) { + Service *s = SERVICE(u); + + if (managed_oom) + log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd."); + else + log_unit_debug(u, "Process of control group was killed by the OOM killer."); + + if (s->oom_policy == OOM_CONTINUE) + return; + + switch (s->state) { + + case SERVICE_CONDITION: + case SERVICE_START_PRE: + case SERVICE_START: + case SERVICE_START_POST: + case SERVICE_STOP: + if (s->oom_policy == OOM_STOP) + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_OOM_KILL); + else if (s->oom_policy == OOM_KILL) + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL); + + break; + + case SERVICE_EXITED: + case SERVICE_RUNNING: + if (s->oom_policy == OOM_STOP) + service_enter_stop(s, SERVICE_FAILURE_OOM_KILL); + else if (s->oom_policy == OOM_KILL) + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL); + + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL); + break; + + case SERVICE_STOP_SIGKILL: + case SERVICE_FINAL_SIGKILL: + if (s->result == SERVICE_SUCCESS) + s->result = SERVICE_FAILURE_OOM_KILL; + break; + + case SERVICE_STOP_POST: + case SERVICE_FINAL_SIGTERM: + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_OOM_KILL); + break; + + default: + ; + } +} + +static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { + bool notify_dbus = true; + Service *s = SERVICE(u); + ServiceResult f; + ExitClean clean_mode; + + assert(s); + assert(pid >= 0); + + /* Oneshot services and non-SERVICE_EXEC_START commands should not be + * considered daemons as they are typically not long running. */ + if (s->type == SERVICE_ONESHOT || (s->control_pid.pid == pid && s->control_command_id != SERVICE_EXEC_START)) + clean_mode = EXIT_CLEAN_COMMAND; + else + clean_mode = EXIT_CLEAN_DAEMON; + + if (is_clean_exit(code, status, clean_mode, &s->success_status)) + f = SERVICE_SUCCESS; + else if (code == CLD_EXITED) + f = SERVICE_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = SERVICE_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = SERVICE_FAILURE_CORE_DUMP; + else + assert_not_reached(); + + if (s->main_pid.pid == pid) { + /* Clean up the exec_fd event source. We want to do this here, not later in + * service_set_state(), because service_enter_stop_post() calls service_spawn(). + * The source owns its end of the pipe, so this will close that too. */ + s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source); + + /* Forking services may occasionally move to a new PID. + * As long as they update the PID file before exiting the old + * PID, they're fine. */ + if (service_load_pid_file(s, false) > 0) + return; + + pidref_done(&s->main_pid); + exec_status_exit(&s->main_exec_status, &s->exec_context, pid, code, status); + + if (s->main_command) { + /* If this is not a forking service than the + * main process got started and hence we copy + * the exit status so that it is recorded both + * as main and as control process exit + * status */ + + s->main_command->exec_status = s->main_exec_status; + + if (s->main_command->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SERVICE_SUCCESS; + } else if (s->exec_command[SERVICE_EXEC_START]) { + + /* If this is a forked process, then we should + * ignore the return value if this was + * configured for the starter process */ + + if (s->exec_command[SERVICE_EXEC_START]->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SERVICE_SUCCESS; + } + + unit_log_process_exit( + u, + "Main process", + service_exec_command_to_string(SERVICE_EXEC_START), + f == SERVICE_SUCCESS, + code, status); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + if (s->main_command && + s->main_command->command_next && + s->type == SERVICE_ONESHOT && + f == SERVICE_SUCCESS) { + + /* There is another command to execute, so let's do that. */ + + log_unit_debug(u, "Running next main command for state %s.", service_state_to_string(s->state)); + service_run_next_main(s); + + } else { + s->main_command = NULL; + + /* Services with ExitType=cgroup do not act on main PID exiting, unless the cgroup is + * already empty */ + if (s->exit_type == SERVICE_EXIT_MAIN || cgroup_good(s) <= 0) { + /* The service exited, so the service is officially gone. */ + switch (s->state) { + + case SERVICE_START_POST: + case SERVICE_RELOAD: + case SERVICE_RELOAD_SIGNAL: + case SERVICE_RELOAD_NOTIFY: + /* If neither main nor control processes are running then the current + * state can never exit cleanly, hence immediately terminate the + * service. */ + if (control_pid_good(s) <= 0) + service_enter_stop(s, f); + + /* Otherwise need to wait until the operation is done. */ + break; + + case SERVICE_STOP: + /* Need to wait until the operation is done. */ + break; + + case SERVICE_START: + if (s->type == SERVICE_ONESHOT) { + /* This was our main goal, so let's go on */ + if (f == SERVICE_SUCCESS) + service_enter_start_post(s); + else + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + } else if (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD)) { + /* Only enter running through a notification, so that the + * SERVICE_START state signifies that no ready notification + * has been received */ + if (f != SERVICE_SUCCESS) + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + else if (!s->remain_after_exit || service_get_notify_access(s) == NOTIFY_MAIN) + /* The service has never been and will never be active */ + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL); + break; + } + + _fallthrough_; + case SERVICE_RUNNING: + service_enter_running(s, f); + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + + if (control_pid_good(s) <= 0) + service_enter_stop_post(s, f); + + /* If there is still a control process, wait for that first */ + break; + + case SERVICE_STOP_POST: + + if (control_pid_good(s) <= 0) + service_enter_signal(s, SERVICE_FINAL_SIGTERM, f); + + break; + + case SERVICE_FINAL_WATCHDOG: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + + if (control_pid_good(s) <= 0) + service_enter_dead(s, f, true); + break; + + default: + assert_not_reached(); + } + } else if (s->exit_type == SERVICE_EXIT_CGROUP && s->state == SERVICE_START) + /* If a main process exits very quickly, this function might be executed + * before service_dispatch_exec_io(). Since this function disabled IO events + * to monitor the main process above, we need to update the state here too. + * Let's consider the process is successfully launched and exited. */ + service_enter_start_post(s); + } + + } else if (s->control_pid.pid == pid) { + const char *kind; + bool success; + + pidref_done(&s->control_pid); + + if (s->control_command) { + exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status); + + if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SERVICE_SUCCESS; + } + + /* ExecCondition= calls that exit with (0, 254] should invoke skip-like behavior instead of failing */ + if (s->state == SERVICE_CONDITION) { + if (f == SERVICE_FAILURE_EXIT_CODE && status < 255) { + UNIT(s)->condition_result = false; + f = SERVICE_SKIP_CONDITION; + success = true; + } else if (f == SERVICE_SUCCESS) { + UNIT(s)->condition_result = true; + success = true; + } else + success = false; + + kind = "Condition check process"; + } else { + kind = "Control process"; + success = f == SERVICE_SUCCESS; + } + + unit_log_process_exit( + u, + kind, + service_exec_command_to_string(s->control_command_id), + success, + code, status); + + if (s->state != SERVICE_RELOAD && s->result == SERVICE_SUCCESS) + s->result = f; + + if (s->control_command && + s->control_command->command_next && + f == SERVICE_SUCCESS) { + + /* There is another command to * execute, so let's do that. */ + + log_unit_debug(u, "Running next control command for state %s.", service_state_to_string(s->state)); + service_run_next_control(s); + + } else { + /* No further commands for this step, so let's figure out what to do next */ + + s->control_command = NULL; + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + + log_unit_debug(u, "Got final SIGCHLD for state %s.", service_state_to_string(s->state)); + + switch (s->state) { + + case SERVICE_CONDITION: + if (f == SERVICE_SUCCESS) + service_enter_start_pre(s); + else + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + + case SERVICE_START_PRE: + if (f == SERVICE_SUCCESS) + service_enter_start(s); + else + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + + case SERVICE_START: + if (s->type != SERVICE_FORKING) + /* Maybe spurious event due to a reload that changed the type? */ + break; + + if (f != SERVICE_SUCCESS) { + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + } + + if (s->pid_file) { + bool has_start_post; + int r; + + /* Let's try to load the pid file here if we can. + * The PID file might actually be created by a START_POST + * script. In that case don't worry if the loading fails. */ + + has_start_post = s->exec_command[SERVICE_EXEC_START_POST]; + r = service_load_pid_file(s, !has_start_post); + if (!has_start_post && r < 0) { + r = service_demand_pid_file(s); + if (r < 0 || cgroup_good(s) == 0) + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL); + break; + } + } else + service_search_main_pid(s); + + service_enter_start_post(s); + break; + + case SERVICE_START_POST: + if (f != SERVICE_SUCCESS) { + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + } + + if (s->pid_file) { + int r; + + r = service_load_pid_file(s, true); + if (r < 0) { + r = service_demand_pid_file(s); + if (r < 0 || cgroup_good(s) == 0) + service_enter_stop(s, SERVICE_FAILURE_PROTOCOL); + break; + } + } else + service_search_main_pid(s); + + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_RELOAD: + case SERVICE_RELOAD_SIGNAL: + case SERVICE_RELOAD_NOTIFY: + if (f == SERVICE_SUCCESS) + if (service_load_pid_file(s, true) < 0) + service_search_main_pid(s); + + s->reload_result = f; + + /* If the last notification we received from the service process indicates + * we are still reloading, then don't leave reloading state just yet, just + * transition into SERVICE_RELOAD_NOTIFY, to wait for the READY=1 coming, + * too. */ + if (s->notify_state == NOTIFY_RELOADING) + service_set_state(s, SERVICE_RELOAD_NOTIFY); + else + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP: + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + if (main_pid_good(s) <= 0) + service_enter_stop_post(s, f); + + /* If there is still a service process around, wait until + * that one quit, too */ + break; + + case SERVICE_STOP_POST: + if (main_pid_good(s) <= 0) + service_enter_signal(s, SERVICE_FINAL_SIGTERM, f); + break; + + case SERVICE_FINAL_WATCHDOG: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + if (main_pid_good(s) <= 0) + service_enter_dead(s, f, true); + break; + + case SERVICE_CLEANING: + + if (s->clean_result == SERVICE_SUCCESS) + s->clean_result = f; + + service_enter_dead(s, SERVICE_SUCCESS, false); + break; + + default: + assert_not_reached(); + } + } + } else /* Neither control nor main PID? If so, don't notify about anything */ + notify_dbus = false; + + /* Notify clients about changed exit status */ + if (notify_dbus) + unit_add_to_dbus_queue(u); + + /* We watch the main/control process otherwise we can't retrieve the unit they + * belong to with cgroupv1. But if they are not our direct child, we won't get a + * SIGCHLD for them. Therefore we need to look for others to watch so we can + * detect when the cgroup becomes empty. Note that the control process is always + * our child so it's pointless to watch all other processes. */ + if (!control_pid_good(s)) + if (!s->main_pid_known || s->main_pid_alien) + (void) unit_enqueue_rewatch_pids(u); +} + +static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Service *s = SERVICE(userdata); + + assert(s); + assert(source == s->timer_event_source); + + switch (s->state) { + + case SERVICE_CONDITION: + case SERVICE_START_PRE: + case SERVICE_START: + case SERVICE_START_POST: + switch (s->timeout_start_failure_mode) { + + case SERVICE_TIMEOUT_TERMINATE: + log_unit_warning(UNIT(s), "%s operation timed out. Terminating.", service_state_to_string(s->state)); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_ABORT: + log_unit_warning(UNIT(s), "%s operation timed out. Aborting.", service_state_to_string(s->state)); + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_KILL: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "%s operation timed out. Killing.", service_state_to_string(s->state)); + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "%s operation timed out. Skipping SIGKILL.", service_state_to_string(s->state)); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + } + break; + + default: + assert_not_reached(); + } + break; + + case SERVICE_RUNNING: + log_unit_warning(UNIT(s), "Service reached runtime time limit. Stopping."); + service_enter_stop(s, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_RELOAD: + case SERVICE_RELOAD_SIGNAL: + case SERVICE_RELOAD_NOTIFY: + log_unit_warning(UNIT(s), "Reload operation timed out. Killing reload process."); + service_kill_control_process(s); + s->reload_result = SERVICE_FAILURE_TIMEOUT; + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP: + switch (s->timeout_stop_failure_mode) { + + case SERVICE_TIMEOUT_TERMINATE: + log_unit_warning(UNIT(s), "Stopping timed out. Terminating."); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_ABORT: + log_unit_warning(UNIT(s), "Stopping timed out. Aborting."); + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_KILL: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out. Killing."); + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + } + break; + + default: + assert_not_reached(); + } + break; + + case SERVICE_STOP_WATCHDOG: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Killing."); + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Skipping SIGKILL."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + } + break; + + case SERVICE_STOP_SIGTERM: + if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) { + log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Aborting."); + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + } else if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Killing."); + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Skipping SIGKILL."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + } + + break; + + case SERVICE_STOP_SIGKILL: + /* Uh, we sent a SIGKILL and it is still not gone? + * Must be something we cannot kill, so let's just be + * weirded out and continue */ + + log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_STOP_POST: + switch (s->timeout_stop_failure_mode) { + + case SERVICE_TIMEOUT_TERMINATE: + log_unit_warning(UNIT(s), "State 'stop-post' timed out. Terminating."); + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_ABORT: + log_unit_warning(UNIT(s), "State 'stop-post' timed out. Aborting."); + service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_KILL: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'stop-post' timed out. Killing."); + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'stop-post' timed out. Skipping SIGKILL. Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false); + } + break; + + default: + assert_not_reached(); + } + break; + + case SERVICE_FINAL_WATCHDOG: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Killing."); + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Skipping SIGKILL. Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false); + } + break; + + case SERVICE_FINAL_SIGTERM: + if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) { + log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Aborting."); + service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + } else if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Killing."); + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Skipping SIGKILL. Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false); + } + + break; + + case SERVICE_FINAL_SIGKILL: + log_unit_warning(UNIT(s), "Processes still around after final SIGKILL. Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, true); + break; + + case SERVICE_AUTO_RESTART: + if (s->restart_usec > 0) + log_unit_debug(UNIT(s), + "Service restart interval %s expired, scheduling restart.", + FORMAT_TIMESPAN(service_restart_usec_next(s), USEC_PER_SEC)); + else + log_unit_debug(UNIT(s), + "Service has no hold-off time (RestartSec=0), scheduling restart."); + + service_enter_restart(s); + break; + + case SERVICE_CLEANING: + log_unit_warning(UNIT(s), "Cleaning timed out. killing."); + + if (s->clean_result == SERVICE_SUCCESS) + s->clean_result = SERVICE_FAILURE_TIMEOUT; + + service_enter_signal(s, SERVICE_FINAL_SIGKILL, 0); + break; + + default: + assert_not_reached(); + } + + return 0; +} + +static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata) { + Service *s = SERVICE(userdata); + usec_t watchdog_usec; + + assert(s); + assert(source == s->watchdog_event_source); + + watchdog_usec = service_get_watchdog_usec(s); + + if (UNIT(s)->manager->service_watchdogs) { + log_unit_error(UNIT(s), "Watchdog timeout (limit %s)!", + FORMAT_TIMESPAN(watchdog_usec, 1)); + + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG); + } else + log_unit_warning(UNIT(s), "Watchdog disabled! Ignoring watchdog timeout (limit %s)!", + FORMAT_TIMESPAN(watchdog_usec, 1)); + + return 0; +} + +static bool service_notify_message_authorized(Service *s, pid_t pid, FDSet *fds) { + assert(s); + + NotifyAccess notify_access = service_get_notify_access(s); + + if (notify_access == NOTIFY_NONE) { + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled.", pid); + return false; + } + + if (notify_access == NOTIFY_MAIN && pid != s->main_pid.pid) { + if (pidref_is_set(&s->main_pid)) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid); + else + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid); + + return false; + } + + if (notify_access == NOTIFY_EXEC && pid != s->main_pid.pid && pid != s->control_pid.pid) { + if (pidref_is_set(&s->main_pid) && pidref_is_set(&s->control_pid)) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT, + pid, s->main_pid.pid, s->control_pid.pid); + else if (pidref_is_set(&s->main_pid)) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid); + else if (pidref_is_set(&s->control_pid)) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid.pid); + else + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid); + + return false; + } + + return true; +} + +static void service_force_watchdog(Service *s) { + if (!UNIT(s)->manager->service_watchdogs) + return; + + log_unit_error(UNIT(s), "Watchdog request (last status: %s)!", + s->status_text ?: ""); + + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG); +} + +static void service_notify_message( + Unit *u, + const struct ucred *ucred, + char * const *tags, + FDSet *fds) { + + Service *s = SERVICE(u); + bool notify_dbus = false; + usec_t monotonic_usec = USEC_INFINITY; + const char *e; + int r; + + assert(u); + assert(ucred); + + if (!service_notify_message_authorized(s, ucred->pid, fds)) + return; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *cc = NULL; + + cc = strv_join(tags, ", "); + log_unit_debug(u, "Got notification message from PID "PID_FMT" (%s)", ucred->pid, empty_to_na(cc)); + } + + /* Interpret MAINPID= */ + e = strv_find_startswith(tags, "MAINPID="); + if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY)) { + _cleanup_(pidref_done) PidRef new_main_pid = PIDREF_NULL; + + r = pidref_set_pidstr(&new_main_pid, e); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to parse MAINPID=%s field in notification message, ignoring: %m", e); + else if (!s->main_pid_known || !pidref_equal(&new_main_pid, &s->main_pid)) { + + r = service_is_suitable_main_pid(s, &new_main_pid, LOG_WARNING); + if (r == 0) { + /* The new main PID is a bit suspicious, which is OK if the sender is privileged. */ + + if (ucred->uid == 0) { + log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, but we'll accept it as the request to change it came from a privileged process.", new_main_pid.pid); + r = 1; + } else + log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid.pid); + } + if (r > 0) { + (void) service_set_main_pidref(s, &new_main_pid); + + r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to watch new main PID "PID_FMT" for service: %m", s->main_pid.pid); + + notify_dbus = true; + } + } + } + + /* Parse MONOTONIC_USEC= */ + e = strv_find_startswith(tags, "MONOTONIC_USEC="); + if (e) { + r = safe_atou64(e, &monotonic_usec); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to parse MONOTONIC_USEC= field in notification message, ignoring: %s", e); + } + + /* Interpret READY=/STOPPING=/RELOADING=. STOPPING= wins over the others, and READY= over RELOADING= */ + if (strv_contains(tags, "STOPPING=1")) { + s->notify_state = NOTIFY_STOPPING; + + if (IN_SET(s->state, SERVICE_RUNNING, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY)) + service_enter_stop_by_notify(s); + + notify_dbus = true; + + } else if (strv_contains(tags, "READY=1")) { + + s->notify_state = NOTIFY_READY; + + /* Type=notify services inform us about completed initialization with READY=1 */ + if (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD) && + s->state == SERVICE_START) + service_enter_start_post(s); + + /* Sending READY=1 while we are reloading informs us that the reloading is complete. */ + if (s->state == SERVICE_RELOAD_NOTIFY) + service_enter_running(s, SERVICE_SUCCESS); + + /* Combined RELOADING=1 and READY=1? Then this is indication that the service started and + * immediately finished reloading. */ + if (s->state == SERVICE_RELOAD_SIGNAL && + strv_contains(tags, "RELOADING=1") && + monotonic_usec != USEC_INFINITY && + monotonic_usec >= s->reload_begin_usec) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + /* Propagate a reload explicitly */ + r = manager_propagate_reload(UNIT(s)->manager, UNIT(s), JOB_FAIL, &error); + if (r < 0) + log_unit_warning(UNIT(s), "Failed to schedule propagation of reload, ignoring: %s", bus_error_message(&error, r)); + + service_enter_running(s, SERVICE_SUCCESS); + } + + notify_dbus = true; + + } else if (strv_contains(tags, "RELOADING=1")) { + + s->notify_state = NOTIFY_RELOADING; + + /* Sending RELOADING=1 after we send SIGHUP to request a reload will transition + * things to "reload-notify" state, where we'll wait for READY=1 to let us know the + * reload is done. Note that we insist on a timestamp being sent along here, so that + * we know for sure this is a reload cycle initiated *after* we sent the signal */ + if (s->state == SERVICE_RELOAD_SIGNAL && + monotonic_usec != USEC_INFINITY && + monotonic_usec >= s->reload_begin_usec) + /* Note, we don't call service_enter_reload_by_notify() here, because we + * don't need reload propagation nor do we want to restart the time-out. */ + service_set_state(s, SERVICE_RELOAD_NOTIFY); + + if (s->state == SERVICE_RUNNING) + service_enter_reload_by_notify(s); + + notify_dbus = true; + } + + /* Interpret STATUS= */ + e = strv_find_startswith(tags, "STATUS="); + if (e) { + _cleanup_free_ char *t = NULL; + + if (!isempty(e)) { + /* Note that this size limit check is mostly paranoia: since the datagram size we are willing + * to process is already limited to NOTIFY_BUFFER_MAX, this limit here should never be hit. */ + if (strlen(e) > STATUS_TEXT_MAX) + log_unit_warning(u, "Status message overly long (%zu > %u), ignoring.", strlen(e), STATUS_TEXT_MAX); + else if (!utf8_is_valid(e)) + log_unit_warning(u, "Status message in notification message is not UTF-8 clean, ignoring."); + else { + t = strdup(e); + if (!t) + log_oom(); + } + } + + if (!streq_ptr(s->status_text, t)) { + free_and_replace(s->status_text, t); + notify_dbus = true; + } + } + + /* Interpret NOTIFYACCESS= */ + e = strv_find_startswith(tags, "NOTIFYACCESS="); + if (e) { + NotifyAccess notify_access; + + notify_access = notify_access_from_string(e); + if (notify_access < 0) + log_unit_warning_errno(u, notify_access, + "Failed to parse NOTIFYACCESS= field value '%s' in notification message, ignoring: %m", e); + + /* We don't need to check whether the new access mode is more strict than what is + * already in use, since only the privileged process is allowed to change it + * in the first place. */ + if (service_get_notify_access(s) != notify_access) { + service_override_notify_access(s, notify_access); + notify_dbus = true; + } + } + + /* Interpret ERRNO= */ + e = strv_find_startswith(tags, "ERRNO="); + if (e) { + int status_errno; + + status_errno = parse_errno(e); + if (status_errno < 0) + log_unit_warning_errno(u, status_errno, + "Failed to parse ERRNO= field value '%s' in notification message: %m", e); + else if (s->status_errno != status_errno) { + s->status_errno = status_errno; + notify_dbus = true; + } + } + + /* Interpret EXTEND_TIMEOUT= */ + e = strv_find_startswith(tags, "EXTEND_TIMEOUT_USEC="); + if (e) { + usec_t extend_timeout_usec; + if (safe_atou64(e, &extend_timeout_usec) < 0) + log_unit_warning(u, "Failed to parse EXTEND_TIMEOUT_USEC=%s", e); + else + service_extend_timeout(s, extend_timeout_usec); + } + + /* Interpret WATCHDOG= */ + e = strv_find_startswith(tags, "WATCHDOG="); + if (e) { + if (streq(e, "1")) + service_reset_watchdog(s); + else if (streq(e, "trigger")) + service_force_watchdog(s); + else + log_unit_warning(u, "Passed WATCHDOG= field is invalid, ignoring."); + } + + e = strv_find_startswith(tags, "WATCHDOG_USEC="); + if (e) { + usec_t watchdog_override_usec; + if (safe_atou64(e, &watchdog_override_usec) < 0) + log_unit_warning(u, "Failed to parse WATCHDOG_USEC=%s", e); + else + service_override_watchdog_timeout(s, watchdog_override_usec); + } + + /* Process FD store messages. Either FDSTOREREMOVE=1 for removal, or FDSTORE=1 for addition. In both cases, + * process FDNAME= for picking the file descriptor name to use. Note that FDNAME= is required when removing + * fds, but optional when pushing in new fds, for compatibility reasons. */ + if (strv_contains(tags, "FDSTOREREMOVE=1")) { + const char *name; + + name = strv_find_startswith(tags, "FDNAME="); + if (!name || !fdname_is_valid(name)) + log_unit_warning(u, "FDSTOREREMOVE=1 requested, but no valid file descriptor name passed, ignoring."); + else + service_remove_fd_store(s, name); + + } else if (strv_contains(tags, "FDSTORE=1")) { + const char *name; + + name = strv_find_startswith(tags, "FDNAME="); + if (name && !fdname_is_valid(name)) { + log_unit_warning(u, "Passed FDNAME= name is invalid, ignoring."); + name = NULL; + } + + (void) service_add_fd_store_set(s, fds, name, !strv_contains(tags, "FDPOLL=0")); + } + + /* Notify clients about changed status or main pid */ + if (notify_dbus) + unit_add_to_dbus_queue(u); +} + +static int service_get_timeout(Unit *u, usec_t *timeout) { + Service *s = SERVICE(u); + uint64_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static usec_t service_get_timeout_start_usec(Unit *u) { + Service *s = SERVICE(ASSERT_PTR(u)); + return s->timeout_start_usec; +} + +static bool pick_up_pid_from_bus_name(Service *s) { + assert(s); + + /* If the service is running but we have no main PID yet, get it from the owner of the D-Bus name */ + + return !pidref_is_set(&s->main_pid) && + IN_SET(s->state, + SERVICE_START, + SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD, + SERVICE_RELOAD_SIGNAL, + SERVICE_RELOAD_NOTIFY); +} + +static int bus_name_pid_lookup_callback(sd_bus_message *reply, void *userdata, sd_bus_error *ret_error) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + const sd_bus_error *e; + Unit *u = ASSERT_PTR(userdata); + uint32_t pid; + Service *s; + int r; + + assert(reply); + + s = SERVICE(u); + s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot); + + if (!s->bus_name || !pick_up_pid_from_bus_name(s)) + return 1; + + e = sd_bus_message_get_error(reply); + if (e) { + r = sd_bus_error_get_errno(e); + log_warning_errno(r, "GetConnectionUnixProcessID() failed: %s", bus_error_message(e, r)); + return 1; + } + + r = sd_bus_message_read(reply, "u", &pid); + if (r < 0) { + bus_log_parse_error(r); + return 1; + } + + r = pidref_set_pid(&pidref, pid); + if (r < 0) { + log_debug_errno(r, "GetConnectionUnixProcessID() returned invalid PID: %m"); + return 1; + } + + log_unit_debug(u, "D-Bus name %s is now owned by process " PID_FMT, s->bus_name, pidref.pid); + + (void) service_set_main_pidref(s, &pidref); + (void) unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false); + return 1; +} + +static void service_bus_name_owner_change(Unit *u, const char *new_owner) { + + Service *s = SERVICE(u); + int r; + + assert(s); + + if (new_owner) + log_unit_debug(u, "D-Bus name %s now owned by %s", s->bus_name, new_owner); + else + log_unit_debug(u, "D-Bus name %s now not owned by anyone.", s->bus_name); + + s->bus_name_good = new_owner; + + /* Track the current owner, so we can reconstruct changes after a daemon reload */ + r = free_and_strdup(&s->bus_name_owner, new_owner); + if (r < 0) { + log_unit_error_errno(u, r, "Unable to set new bus name owner %s: %m", new_owner); + return; + } + + if (s->type == SERVICE_DBUS) { + + /* service_enter_running() will figure out what to + * do */ + if (s->state == SERVICE_RUNNING) + service_enter_running(s, SERVICE_SUCCESS); + else if (s->state == SERVICE_START && new_owner) + service_enter_start_post(s); + + } else if (new_owner && pick_up_pid_from_bus_name(s)) { + + /* Try to acquire PID from bus service */ + + s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot); + + r = sd_bus_call_method_async( + u->manager->api_bus, + &s->bus_name_pid_lookup_slot, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetConnectionUnixProcessID", + bus_name_pid_lookup_callback, + s, + "s", + s->bus_name); + if (r < 0) + log_debug_errno(r, "Failed to request owner PID of service name, ignoring: %m"); + } +} + +int service_set_socket_fd( + Service *s, + int fd, + Socket *sock, + SocketPeer *peer, + bool selinux_context_net) { + + _cleanup_free_ char *peer_text = NULL; + int r; + + assert(s); + assert(fd >= 0); + + /* This is called by the socket code when instantiating a new service for a stream socket and the socket needs + * to be configured. We take ownership of the passed fd on success. */ + + if (UNIT(s)->load_state != UNIT_LOADED) + return -EINVAL; + + if (s->socket_fd >= 0) + return -EBUSY; + + assert(!s->socket_peer); + + if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_DEAD_RESOURCES_PINNED)) + return -EAGAIN; + + if (getpeername_pretty(fd, true, &peer_text) >= 0) { + + if (UNIT(s)->description) { + _cleanup_free_ char *a = NULL; + + a = strjoin(UNIT(s)->description, " (", peer_text, ")"); + if (!a) + return -ENOMEM; + + r = unit_set_description(UNIT(s), a); + } else + r = unit_set_description(UNIT(s), peer_text); + if (r < 0) + return r; + } + + r = unit_add_two_dependencies(UNIT(sock), UNIT_BEFORE, UNIT_TRIGGERS, UNIT(s), false, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + + s->socket_fd = fd; + s->socket_peer = socket_peer_ref(peer); + s->socket_fd_selinux_context_net = selinux_context_net; + + unit_ref_set(&s->accept_socket, UNIT(s), UNIT(sock)); + return 0; +} + +static void service_reset_failed(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + if (s->state == SERVICE_FAILED) + service_set_state(s, service_determine_dead_state(s)); + + s->result = SERVICE_SUCCESS; + s->reload_result = SERVICE_SUCCESS; + s->clean_result = SERVICE_SUCCESS; + s->n_restarts = 0; + s->flush_n_restarts = false; +} + +static PidRef* service_main_pid(Unit *u) { + return &ASSERT_PTR(SERVICE(u))->main_pid; +} + +static PidRef* service_control_pid(Unit *u) { + return &ASSERT_PTR(SERVICE(u))->control_pid; +} + +static bool service_needs_console(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + /* We provide our own implementation of this here, instead of relying of the generic implementation + * unit_needs_console() provides, since we want to return false if we are in SERVICE_EXITED state. */ + + if (!exec_context_may_touch_console(&s->exec_context)) + return false; + + return IN_SET(s->state, + SERVICE_CONDITION, + SERVICE_START_PRE, + SERVICE_START, + SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD, + SERVICE_RELOAD_SIGNAL, + SERVICE_RELOAD_NOTIFY, + SERVICE_STOP, + SERVICE_STOP_WATCHDOG, + SERVICE_STOP_SIGTERM, + SERVICE_STOP_SIGKILL, + SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, + SERVICE_FINAL_SIGTERM, + SERVICE_FINAL_SIGKILL); +} + +static int service_exit_status(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + + if (s->main_exec_status.pid <= 0 || + !dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) + return -ENODATA; + + if (s->main_exec_status.code != CLD_EXITED) + return -EBADE; + + return s->main_exec_status.status; +} + +static const char* service_status_text(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + return s->status_text; +} + +static int service_clean(Unit *u, ExecCleanMask mask) { + _cleanup_strv_free_ char **l = NULL; + bool may_clean_fdstore = false; + Service *s = SERVICE(u); + int r; + + assert(s); + assert(mask != 0); + + if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_DEAD_RESOURCES_PINNED)) + return -EBUSY; + + /* Determine if there's anything we could potentially clean */ + r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l); + if (r < 0) + return r; + + if (mask & EXEC_CLEAN_FDSTORE) + may_clean_fdstore = s->n_fd_store > 0 || s->n_fd_store_max > 0; + + if (strv_isempty(l) && !may_clean_fdstore) + return -EUNATCH; /* Nothing to potentially clean */ + + /* Let's clean the stuff we can clean quickly */ + if (may_clean_fdstore) + service_release_fd_store(s); + + /* If we are done, leave quickly */ + if (strv_isempty(l)) { + if (s->state == SERVICE_DEAD_RESOURCES_PINNED && !s->fd_store) + service_set_state(s, SERVICE_DEAD); + return 0; + } + + /* We need to clean disk stuff. This is slow, hence do it out of process, and change state */ + service_unwatch_control_pid(s); + s->clean_result = SERVICE_SUCCESS; + s->control_command = NULL; + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + + r = service_arm_timer(s, /* relative= */ true, s->exec_context.timeout_clean_usec); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to install timer: %m"); + goto fail; + } + + r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m"); + goto fail; + } + + service_set_state(s, SERVICE_CLEANING); + return 0; + +fail: + s->clean_result = SERVICE_FAILURE_RESOURCES; + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + return r; +} + +static int service_can_clean(Unit *u, ExecCleanMask *ret) { + Service *s = SERVICE(u); + ExecCleanMask mask = 0; + int r; + + assert(s); + assert(ret); + + r = exec_context_get_clean_mask(&s->exec_context, &mask); + if (r < 0) + return r; + + if (s->n_fd_store_max > 0) + mask |= EXEC_CLEAN_FDSTORE; + + *ret = mask; + return 0; +} + +static const char *service_finished_job(Unit *u, JobType t, JobResult result) { + if (t == JOB_START && + result == JOB_DONE && + SERVICE(u)->type == SERVICE_ONESHOT) + return "Finished %s."; + + /* Fall back to generic */ + return NULL; +} + +static int service_can_start(Unit *u) { + Service *s = SERVICE(u); + int r; + + assert(s); + + /* Make sure we don't enter a busy loop of some kind. */ + r = unit_test_start_limit(u); + if (r < 0) { + service_enter_dead(s, SERVICE_FAILURE_START_LIMIT_HIT, false); + return r; + } + + return 1; +} + +static void service_release_resources(Unit *u) { + Service *s = SERVICE(ASSERT_PTR(u)); + + /* Invoked by the unit state engine, whenever it realizes that unit is dead and there's no job + * anymore for it, and it hence is a good idea to release resources */ + + /* Don't release resources if this is a transitionary failed/dead state + * (i.e. SERVICE_DEAD_BEFORE_AUTO_RESTART/SERVICE_FAILED_BEFORE_AUTO_RESTART), insist on a permanent + * failure state. */ + if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED)) + return; + + log_unit_debug(u, "Releasing resources..."); + + service_release_socket_fd(s); + service_release_stdio_fd(s); + + if (s->fd_store_preserve_mode != EXEC_PRESERVE_YES) + service_release_fd_store(s); + + if (s->state == SERVICE_DEAD_RESOURCES_PINNED && !s->fd_store) + service_set_state(s, SERVICE_DEAD); +} + +static const char* const service_restart_table[_SERVICE_RESTART_MAX] = { + [SERVICE_RESTART_NO] = "no", + [SERVICE_RESTART_ON_SUCCESS] = "on-success", + [SERVICE_RESTART_ON_FAILURE] = "on-failure", + [SERVICE_RESTART_ON_ABNORMAL] = "on-abnormal", + [SERVICE_RESTART_ON_WATCHDOG] = "on-watchdog", + [SERVICE_RESTART_ON_ABORT] = "on-abort", + [SERVICE_RESTART_ALWAYS] = "always", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_restart, ServiceRestart); + +static const char* const service_restart_mode_table[_SERVICE_RESTART_MODE_MAX] = { + [SERVICE_RESTART_MODE_NORMAL] = "normal", + [SERVICE_RESTART_MODE_DIRECT] = "direct", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_restart_mode, ServiceRestartMode); + +static const char* const service_type_table[_SERVICE_TYPE_MAX] = { + [SERVICE_SIMPLE] = "simple", + [SERVICE_FORKING] = "forking", + [SERVICE_ONESHOT] = "oneshot", + [SERVICE_DBUS] = "dbus", + [SERVICE_NOTIFY] = "notify", + [SERVICE_NOTIFY_RELOAD] = "notify-reload", + [SERVICE_IDLE] = "idle", + [SERVICE_EXEC] = "exec", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType); + +static const char* const service_exit_type_table[_SERVICE_EXIT_TYPE_MAX] = { + [SERVICE_EXIT_MAIN] = "main", + [SERVICE_EXIT_CGROUP] = "cgroup", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_exit_type, ServiceExitType); + +static const char* const service_exec_command_table[_SERVICE_EXEC_COMMAND_MAX] = { + [SERVICE_EXEC_CONDITION] = "ExecCondition", + [SERVICE_EXEC_START_PRE] = "ExecStartPre", + [SERVICE_EXEC_START] = "ExecStart", + [SERVICE_EXEC_START_POST] = "ExecStartPost", + [SERVICE_EXEC_RELOAD] = "ExecReload", + [SERVICE_EXEC_STOP] = "ExecStop", + [SERVICE_EXEC_STOP_POST] = "ExecStopPost", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_exec_command, ServiceExecCommand); + +static const char* const service_exec_ex_command_table[_SERVICE_EXEC_COMMAND_MAX] = { + [SERVICE_EXEC_CONDITION] = "ExecConditionEx", + [SERVICE_EXEC_START_PRE] = "ExecStartPreEx", + [SERVICE_EXEC_START] = "ExecStartEx", + [SERVICE_EXEC_START_POST] = "ExecStartPostEx", + [SERVICE_EXEC_RELOAD] = "ExecReloadEx", + [SERVICE_EXEC_STOP] = "ExecStopEx", + [SERVICE_EXEC_STOP_POST] = "ExecStopPostEx", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_exec_ex_command, ServiceExecCommand); + +static const char* const notify_state_table[_NOTIFY_STATE_MAX] = { + [NOTIFY_UNKNOWN] = "unknown", + [NOTIFY_READY] = "ready", + [NOTIFY_RELOADING] = "reloading", + [NOTIFY_STOPPING] = "stopping", +}; + +DEFINE_STRING_TABLE_LOOKUP(notify_state, NotifyState); + +static const char* const service_result_table[_SERVICE_RESULT_MAX] = { + [SERVICE_SUCCESS] = "success", + [SERVICE_FAILURE_RESOURCES] = "resources", + [SERVICE_FAILURE_PROTOCOL] = "protocol", + [SERVICE_FAILURE_TIMEOUT] = "timeout", + [SERVICE_FAILURE_EXIT_CODE] = "exit-code", + [SERVICE_FAILURE_SIGNAL] = "signal", + [SERVICE_FAILURE_CORE_DUMP] = "core-dump", + [SERVICE_FAILURE_WATCHDOG] = "watchdog", + [SERVICE_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [SERVICE_FAILURE_OOM_KILL] = "oom-kill", + [SERVICE_SKIP_CONDITION] = "exec-condition", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult); + +static const char* const service_timeout_failure_mode_table[_SERVICE_TIMEOUT_FAILURE_MODE_MAX] = { + [SERVICE_TIMEOUT_TERMINATE] = "terminate", + [SERVICE_TIMEOUT_ABORT] = "abort", + [SERVICE_TIMEOUT_KILL] = "kill", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_timeout_failure_mode, ServiceTimeoutFailureMode); + +const UnitVTable service_vtable = { + .object_size = sizeof(Service), + .exec_context_offset = offsetof(Service, exec_context), + .cgroup_context_offset = offsetof(Service, cgroup_context), + .kill_context_offset = offsetof(Service, kill_context), + .exec_runtime_offset = offsetof(Service, exec_runtime), + + .sections = + "Unit\0" + "Service\0" + "Install\0", + .private_section = "Service", + + .can_transient = true, + .can_delegate = true, + .can_fail = true, + .can_set_managed_oom = true, + + .init = service_init, + .done = service_done, + .load = service_load, + .release_resources = service_release_resources, + + .coldplug = service_coldplug, + + .dump = service_dump, + + .start = service_start, + .stop = service_stop, + .reload = service_reload, + + .can_reload = service_can_reload, + + .clean = service_clean, + .can_clean = service_can_clean, + + .freeze = unit_freeze_vtable_common, + .thaw = unit_thaw_vtable_common, + + .serialize = service_serialize, + .deserialize_item = service_deserialize_item, + + .active_state = service_active_state, + .sub_state_to_string = service_sub_state_to_string, + + .will_restart = service_will_restart, + + .may_gc = service_may_gc, + + .sigchld_event = service_sigchld_event, + + .reset_failed = service_reset_failed, + + .notify_cgroup_empty = service_notify_cgroup_empty_event, + .notify_cgroup_oom = service_notify_cgroup_oom_event, + .notify_message = service_notify_message, + + .main_pid = service_main_pid, + .control_pid = service_control_pid, + + .bus_name_owner_change = service_bus_name_owner_change, + + .bus_set_property = bus_service_set_property, + .bus_commit_properties = bus_service_commit_properties, + + .get_timeout = service_get_timeout, + .get_timeout_start_usec = service_get_timeout_start_usec, + .needs_console = service_needs_console, + .exit_status = service_exit_status, + .status_text = service_status_text, + + .status_message_formats = { + .finished_start_job = { + [JOB_FAILED] = "Failed to start %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Stopped %s.", + [JOB_FAILED] = "Stopped (with error) %s.", + }, + .finished_job = service_finished_job, + }, + + .can_start = service_can_start, + + .notify_plymouth = true, + + .audit_start_message_type = AUDIT_SERVICE_START, + .audit_stop_message_type = AUDIT_SERVICE_STOP, +}; diff --git a/src/core/service.h b/src/core/service.h new file mode 100644 index 0000000..e85302e --- /dev/null +++ b/src/core/service.h @@ -0,0 +1,290 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Service Service; +typedef struct ServiceFDStore ServiceFDStore; + +#include "exit-status.h" +#include "kill.h" +#include "open-file.h" +#include "path.h" +#include "pidref.h" +#include "ratelimit.h" +#include "socket.h" +#include "unit.h" + +typedef enum ServiceRestart { + SERVICE_RESTART_NO, + SERVICE_RESTART_ON_SUCCESS, + SERVICE_RESTART_ON_FAILURE, + SERVICE_RESTART_ON_ABNORMAL, + SERVICE_RESTART_ON_WATCHDOG, + SERVICE_RESTART_ON_ABORT, + SERVICE_RESTART_ALWAYS, + _SERVICE_RESTART_MAX, + _SERVICE_RESTART_INVALID = -EINVAL, +} ServiceRestart; + +typedef enum ServiceType { + SERVICE_SIMPLE, /* we fork and go on right-away (i.e. modern socket activated daemons) */ + SERVICE_FORKING, /* forks by itself (i.e. traditional daemons) */ + SERVICE_ONESHOT, /* we fork and wait until the program finishes (i.e. programs like fsck which run and need to finish before we continue) */ + SERVICE_DBUS, /* we fork and wait until a specific D-Bus name appears on the bus */ + SERVICE_NOTIFY, /* we fork and wait until a daemon sends us a ready message with sd_notify() */ + SERVICE_NOTIFY_RELOAD, /* just like SERVICE_NOTIFY, but also implements a reload protocol via SIGHUP */ + SERVICE_IDLE, /* much like simple, but delay exec() until all jobs are dispatched. */ + SERVICE_EXEC, /* we fork and wait until we execute exec() (this means our own setup is waited for) */ + _SERVICE_TYPE_MAX, + _SERVICE_TYPE_INVALID = -EINVAL, +} ServiceType; + +typedef enum ServiceExitType { + SERVICE_EXIT_MAIN, /* we consider the main PID when deciding if the service exited */ + SERVICE_EXIT_CGROUP, /* we wait for the last process in the cgroup to exit */ + _SERVICE_EXIT_TYPE_MAX, + _SERVICE_EXIT_TYPE_INVALID = -EINVAL, +} ServiceExitType; + +typedef enum ServiceExecCommand { + SERVICE_EXEC_CONDITION, + SERVICE_EXEC_START_PRE, + SERVICE_EXEC_START, + SERVICE_EXEC_START_POST, + SERVICE_EXEC_RELOAD, + SERVICE_EXEC_STOP, + SERVICE_EXEC_STOP_POST, + _SERVICE_EXEC_COMMAND_MAX, + _SERVICE_EXEC_COMMAND_INVALID = -EINVAL, +} ServiceExecCommand; + +typedef enum NotifyState { + NOTIFY_UNKNOWN, + NOTIFY_READY, + NOTIFY_RELOADING, + NOTIFY_STOPPING, + _NOTIFY_STATE_MAX, + _NOTIFY_STATE_INVALID = -EINVAL, +} NotifyState; + +/* The values of this enum are referenced in man/systemd.exec.xml and src/shared/bus-unit-util.c. + * Update those sources for each change to this enum. */ +typedef enum ServiceResult { + SERVICE_SUCCESS, + SERVICE_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */ + SERVICE_FAILURE_PROTOCOL, + SERVICE_FAILURE_TIMEOUT, + SERVICE_FAILURE_EXIT_CODE, + SERVICE_FAILURE_SIGNAL, + SERVICE_FAILURE_CORE_DUMP, + SERVICE_FAILURE_WATCHDOG, + SERVICE_FAILURE_START_LIMIT_HIT, + SERVICE_FAILURE_OOM_KILL, /* OOM Kill by the Kernel or systemd-oomd */ + SERVICE_SKIP_CONDITION, + _SERVICE_RESULT_MAX, + _SERVICE_RESULT_INVALID = -EINVAL, +} ServiceResult; + +typedef enum ServiceTimeoutFailureMode { + SERVICE_TIMEOUT_TERMINATE, + SERVICE_TIMEOUT_ABORT, + SERVICE_TIMEOUT_KILL, + _SERVICE_TIMEOUT_FAILURE_MODE_MAX, + _SERVICE_TIMEOUT_FAILURE_MODE_INVALID = -EINVAL, +} ServiceTimeoutFailureMode; + +typedef enum ServiceRestartMode { + SERVICE_RESTART_MODE_NORMAL, + SERVICE_RESTART_MODE_DIRECT, + _SERVICE_RESTART_MODE_MAX, + _SERVICE_RESTART_MODE_INVALID = -EINVAL, +} ServiceRestartMode; + +struct ServiceFDStore { + Service *service; + + int fd; + char *fdname; + sd_event_source *event_source; + bool do_poll; + + LIST_FIELDS(ServiceFDStore, fd_store); +}; + +struct Service { + Unit meta; + + ServiceType type; + ServiceExitType exit_type; + ServiceRestart restart; + ServiceRestartMode restart_mode; + ExitStatusSet restart_prevent_status; + ExitStatusSet restart_force_status; + ExitStatusSet success_status; + + /* If set we'll read the main daemon PID from this file */ + char *pid_file; + + usec_t restart_usec; + unsigned restart_steps; + usec_t restart_max_delay_usec; + usec_t timeout_start_usec; + usec_t timeout_stop_usec; + usec_t timeout_abort_usec; + bool timeout_abort_set; + usec_t runtime_max_usec; + usec_t runtime_rand_extra_usec; + ServiceTimeoutFailureMode timeout_start_failure_mode; + ServiceTimeoutFailureMode timeout_stop_failure_mode; + + dual_timestamp watchdog_timestamp; + usec_t watchdog_usec; /* the requested watchdog timeout in the unit file */ + usec_t watchdog_original_usec; /* the watchdog timeout that was in effect when the unit was started, i.e. the timeout the forked off processes currently see */ + usec_t watchdog_override_usec; /* the watchdog timeout requested by the service itself through sd_notify() */ + bool watchdog_override_enable; + sd_event_source *watchdog_event_source; + + ExecCommand* exec_command[_SERVICE_EXEC_COMMAND_MAX]; + + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ServiceState state, deserialized_state; + + /* The exit status of the real main process */ + ExecStatus main_exec_status; + + /* The currently executed control process */ + ExecCommand *control_command; + + /* The currently executed main process, which may be NULL if + * the main process got started via forking mode and not by + * us */ + ExecCommand *main_command; + + /* The ID of the control command currently being executed */ + ServiceExecCommand control_command_id; + + /* Runtime data of the execution context */ + ExecRuntime *exec_runtime; + + PidRef main_pid, control_pid; + + /* if we are a socket activated service instance, store information of the connection/peer/socket */ + int socket_fd; + SocketPeer *socket_peer; + UnitRef accept_socket; + bool socket_fd_selinux_context_net; + + bool permissions_start_only; + bool root_directory_start_only; + bool remain_after_exit; + bool guess_main_pid; + + /* If we shut down, remember why */ + ServiceResult result; + ServiceResult reload_result; + ServiceResult clean_result; + + bool main_pid_known:1; + bool main_pid_alien:1; + bool bus_name_good:1; + bool forbid_restart:1; + bool start_timeout_defined:1; + bool exec_fd_hot:1; + + char *bus_name; + char *bus_name_owner; /* unique name of the current owner */ + + char *status_text; + int status_errno; + + sd_event_source *timer_event_source; + PathSpec *pid_file_pathspec; + + NotifyAccess notify_access; + NotifyAccess notify_access_override; + NotifyState notify_state; + + sd_bus_slot *bus_name_pid_lookup_slot; + + sd_event_source *exec_fd_event_source; + + ServiceFDStore *fd_store; + size_t n_fd_store; + unsigned n_fd_store_max; + ExecPreserveMode fd_store_preserve_mode; + + char *usb_function_descriptors; + char *usb_function_strings; + + int stdin_fd; + int stdout_fd; + int stderr_fd; + + unsigned n_restarts; + bool flush_n_restarts; + + OOMPolicy oom_policy; + + LIST_HEAD(OpenFile, open_files); + + int reload_signal; + usec_t reload_begin_usec; +}; + +static inline usec_t service_timeout_abort_usec(Service *s) { + assert(s); + return s->timeout_abort_set ? s->timeout_abort_usec : s->timeout_stop_usec; +} + +static inline NotifyAccess service_get_notify_access(Service *s) { + assert(s); + return s->notify_access_override < 0 ? s->notify_access : s->notify_access_override; +} + +static inline usec_t service_get_watchdog_usec(Service *s) { + assert(s); + return s->watchdog_override_enable ? s->watchdog_override_usec : s->watchdog_original_usec; +} + +extern const UnitVTable service_vtable; + +int service_set_socket_fd(Service *s, int fd, struct Socket *socket, struct SocketPeer *peer, bool selinux_context_net); +void service_release_socket_fd(Service *s); + +usec_t service_restart_usec_next(Service *s); + +const char* service_restart_to_string(ServiceRestart i) _const_; +ServiceRestart service_restart_from_string(const char *s) _pure_; + +const char* service_restart_mode_to_string(ServiceRestartMode i) _const_; +ServiceRestartMode service_restart_mode_from_string(const char *s) _pure_; + +const char* service_type_to_string(ServiceType i) _const_; +ServiceType service_type_from_string(const char *s) _pure_; + +const char* service_exit_type_to_string(ServiceExitType i) _const_; +ServiceExitType service_exit_type_from_string(const char *s) _pure_; + +const char* service_exec_command_to_string(ServiceExecCommand i) _const_; +ServiceExecCommand service_exec_command_from_string(const char *s) _pure_; + +const char* service_exec_ex_command_to_string(ServiceExecCommand i) _const_; +ServiceExecCommand service_exec_ex_command_from_string(const char *s) _pure_; + +const char* notify_state_to_string(NotifyState i) _const_; +NotifyState notify_state_from_string(const char *s) _pure_; + +const char* service_result_to_string(ServiceResult i) _const_; +ServiceResult service_result_from_string(const char *s) _pure_; + +const char* service_timeout_failure_mode_to_string(ServiceTimeoutFailureMode i) _const_; +ServiceTimeoutFailureMode service_timeout_failure_mode_from_string(const char *s) _pure_; + +DEFINE_CAST(SERVICE, Service); + +#define STATUS_TEXT_MAX (16U*1024U) + +/* Only exported for unit tests */ +int service_deserialize_exec_command(Unit *u, const char *key, const char *value); diff --git a/src/core/show-status.c b/src/core/show-status.c new file mode 100644 index 0000000..606237e --- /dev/null +++ b/src/core/show-status.c @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "iovec-util.h" +#include "parse-util.h" +#include "show-status.h" +#include "string-table.h" +#include "string-util.h" +#include "terminal-util.h" + +static const char* const show_status_table[_SHOW_STATUS_MAX] = { + [SHOW_STATUS_NO] = "no", + [SHOW_STATUS_ERROR] = "error", + [SHOW_STATUS_AUTO] = "auto", + [SHOW_STATUS_TEMPORARY] = "temporary", + [SHOW_STATUS_YES] = "yes", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(show_status, ShowStatus, SHOW_STATUS_YES); + +int parse_show_status(const char *v, ShowStatus *ret) { + ShowStatus s; + + assert(ret); + + s = show_status_from_string(v); + if (s < 0 || s == SHOW_STATUS_TEMPORARY) + return -EINVAL; + + *ret = s; + return 0; +} + +int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) { + static const char status_indent[] = " "; /* "[" STATUS "] " */ + _cleanup_free_ char *s = NULL; + _cleanup_close_ int fd = -EBADF; + struct iovec iovec[7] = {}; + int n = 0; + static bool prev_ephemeral; + + assert(format); + + /* This is independent of logging, as status messages are + * optional and go exclusively to the console. */ + + if (vasprintf(&s, format, ap) < 0) + return log_oom(); + + /* Before you ask: yes, on purpose we open/close the console for each status line we write individually. This + * is a good strategy to avoid PID 1 getting killed by the kernel's SAK concept (it doesn't fix this entirely, + * but minimizes the time window the kernel might end up killing PID 1 due to SAK). It also makes things easier + * for us so that we don't have to recover from hangups and suchlike triggered on the console. */ + + fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return fd; + + if (FLAGS_SET(flags, SHOW_STATUS_ELLIPSIZE)) { + char *e; + size_t emax, sl; + int c; + + c = fd_columns(fd); + if (c <= 0) + c = 80; + + sl = status ? sizeof(status_indent)-1 : 0; + + emax = c - sl - 1; + if (emax < 3) + emax = 3; + + e = ellipsize(s, emax, 50); + if (e) + free_and_replace(s, e); + } + + if (prev_ephemeral) + iovec[n++] = IOVEC_MAKE_STRING(ANSI_REVERSE_LINEFEED "\r" ANSI_ERASE_TO_END_OF_LINE); + + if (status) { + if (!isempty(status)) { + iovec[n++] = IOVEC_MAKE_STRING("["); + iovec[n++] = IOVEC_MAKE_STRING(status); + iovec[n++] = IOVEC_MAKE_STRING("] "); + } else + iovec[n++] = IOVEC_MAKE_STRING(status_indent); + } + + iovec[n++] = IOVEC_MAKE_STRING(s); + iovec[n++] = IOVEC_MAKE_STRING("\r\n"); /* use CRNL instead of just NL, to be robust towards TTYs in raw mode */ + + if (prev_ephemeral && !FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL)) + iovec[n++] = IOVEC_MAKE_STRING(ANSI_ERASE_TO_END_OF_LINE); + prev_ephemeral = FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL); + + if (writev(fd, iovec, n) < 0) + return -errno; + + return 0; +} + +int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) { + va_list ap; + int r; + + assert(format); + + va_start(ap, format); + r = status_vprintf(status, flags, format, ap); + va_end(ap); + + return r; +} + +static const char* const status_unit_format_table[_STATUS_UNIT_FORMAT_MAX] = { + [STATUS_UNIT_FORMAT_NAME] = "name", + [STATUS_UNIT_FORMAT_DESCRIPTION] = "description", + [STATUS_UNIT_FORMAT_COMBINED] = "combined", +}; + +DEFINE_STRING_TABLE_LOOKUP(status_unit_format, StatusUnitFormat); diff --git a/src/core/show-status.h b/src/core/show-status.h new file mode 100644 index 0000000..f441223 --- /dev/null +++ b/src/core/show-status.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +/* Manager status */ + +typedef enum ShowStatus { + SHOW_STATUS_NO, /* printing of status is disabled */ + SHOW_STATUS_ERROR, /* only print errors */ + SHOW_STATUS_AUTO, /* disabled but may flip to _TEMPORARY */ + SHOW_STATUS_TEMPORARY, /* enabled temporarily, may flip back to _AUTO */ + SHOW_STATUS_YES, /* printing of status is enabled */ + _SHOW_STATUS_MAX, + _SHOW_STATUS_INVALID = -EINVAL, +} ShowStatus; + +typedef enum ShowStatusFlags { + SHOW_STATUS_ELLIPSIZE = 1 << 0, + SHOW_STATUS_EPHEMERAL = 1 << 1, +} ShowStatusFlags; + +typedef enum StatusUnitFormat { + STATUS_UNIT_FORMAT_NAME, + STATUS_UNIT_FORMAT_DESCRIPTION, + STATUS_UNIT_FORMAT_COMBINED, + _STATUS_UNIT_FORMAT_MAX, + _STATUS_UNIT_FORMAT_INVALID = -EINVAL, +} StatusUnitFormat; + +static inline bool show_status_on(ShowStatus s) { + return IN_SET(s, SHOW_STATUS_TEMPORARY, SHOW_STATUS_YES); +} +ShowStatus show_status_from_string(const char *v) _const_; +const char* show_status_to_string(ShowStatus s) _pure_; +int parse_show_status(const char *v, ShowStatus *ret); + +StatusUnitFormat status_unit_format_from_string(const char *v) _const_; +const char* status_unit_format_to_string(StatusUnitFormat s) _pure_; + +int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) _printf_(3,0); +int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) _printf_(3,4); diff --git a/src/core/slice.c b/src/core/slice.c new file mode 100644 index 0000000..fb4f23c --- /dev/null +++ b/src/core/slice.c @@ -0,0 +1,462 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "dbus-slice.h" +#include "dbus-unit.h" +#include "fd-util.h" +#include "log.h" +#include "serialize.h" +#include "slice.h" +#include "special.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = { + [SLICE_DEAD] = UNIT_INACTIVE, + [SLICE_ACTIVE] = UNIT_ACTIVE +}; + +static void slice_init(Unit *u) { + assert(u); + assert(u->load_state == UNIT_STUB); + + u->ignore_on_isolate = true; +} + +static void slice_set_state(Slice *t, SliceState state) { + SliceState old_state; + assert(t); + + if (t->state != state) + bus_unit_send_pending_change_signal(UNIT(t), false); + + old_state = t->state; + t->state = state; + + if (state != old_state) + log_debug("%s changed %s -> %s", + UNIT(t)->id, + slice_state_to_string(old_state), + slice_state_to_string(state)); + + unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); +} + +static int slice_add_parent_slice(Slice *s) { + Unit *u = UNIT(s); + _cleanup_free_ char *a = NULL; + int r; + + assert(s); + + if (UNIT_GET_SLICE(u)) + return 0; + + r = slice_build_parent_slice(u->id, &a); + if (r <= 0) /* 0 means root slice */ + return r; + + return unit_add_dependency_by_name(u, UNIT_IN_SLICE, a, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int slice_add_default_dependencies(Slice *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + /* Make sure slices are unloaded on shutdown */ + r = unit_add_two_dependencies_by_name( + UNIT(s), + UNIT_BEFORE, UNIT_CONFLICTS, + SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return 0; +} + +static int slice_verify(Slice *s) { + _cleanup_free_ char *parent = NULL; + int r; + + assert(s); + assert(UNIT(s)->load_state == UNIT_LOADED); + + if (!slice_name_is_valid(UNIT(s)->id)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Slice name %s is not valid. Refusing.", UNIT(s)->id); + + r = slice_build_parent_slice(UNIT(s)->id, &parent); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to determine parent slice: %m"); + + /* If recursive errors are to be ignored, the parent slice should not be verified */ + if (UNIT(s)->manager && FLAGS_SET(UNIT(s)->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES)) + return 0; + + if (parent ? !unit_has_name(UNIT_GET_SLICE(UNIT(s)), parent) : !!UNIT_GET_SLICE(UNIT(s))) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Located outside of parent slice. Refusing."); + + return 0; +} + +static int slice_load_root_slice(Unit *u) { + assert(u); + + if (!unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + u->perpetual = true; + + /* The root slice is a bit special. For example it is always running and cannot be terminated. Because of its + * special semantics we synthesize it here, instead of relying on the unit file on disk. */ + + u->default_dependencies = false; + + if (!u->description) + u->description = strdup("Root Slice"); + if (!u->documentation) + u->documentation = strv_new("man:systemd.special(7)"); + + return 1; +} + +static int slice_load_system_slice(Unit *u) { + assert(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return 0; + if (!unit_has_name(u, SPECIAL_SYSTEM_SLICE)) + return 0; + + u->perpetual = true; + + /* The system slice is a bit special. For example it is always running and cannot be terminated. Because of its + * special semantics we synthesize it here, instead of relying on the unit file on disk. */ + + u->default_dependencies = false; + + if (!u->description) + u->description = strdup("System Slice"); + if (!u->documentation) + u->documentation = strv_new("man:systemd.special(7)"); + + return 1; +} + +static int slice_load(Unit *u) { + Slice *s = SLICE(u); + int r; + + assert(s); + assert(u->load_state == UNIT_STUB); + + r = slice_load_root_slice(u); + if (r < 0) + return r; + r = slice_load_system_slice(u); + if (r < 0) + return r; + + r = unit_load_fragment_and_dropin(u, false); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + /* This is a new unit? Then let's add in some extras */ + r = unit_patch_contexts(u); + if (r < 0) + return r; + + r = slice_add_parent_slice(s); + if (r < 0) + return r; + + r = slice_add_default_dependencies(s); + if (r < 0) + return r; + + if (!u->description) { + _cleanup_free_ char *tmp = NULL; + + r = unit_name_to_path(u->id, &tmp); + if (r >= 0) /* Failure is ignored… */ + u->description = strjoin("Slice ", tmp); + } + + return slice_verify(s); +} + +static int slice_coldplug(Unit *u) { + Slice *t = SLICE(u); + + assert(t); + assert(t->state == SLICE_DEAD); + + if (t->deserialized_state != t->state) + slice_set_state(t, t->deserialized_state); + + return 0; +} + +static void slice_dump(Unit *u, FILE *f, const char *prefix) { + Slice *t = SLICE(u); + + assert(t); + assert(f); + + fprintf(f, + "%sSlice State: %s\n", + prefix, slice_state_to_string(t->state)); + + cgroup_context_dump(UNIT(t), f, prefix); +} + +static int slice_start(Unit *u) { + Slice *t = SLICE(u); + int r; + + assert(t); + assert(t->state == SLICE_DEAD); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + (void) unit_realize_cgroup(u); + (void) unit_reset_accounting(u); + + slice_set_state(t, SLICE_ACTIVE); + return 1; +} + +static int slice_stop(Unit *u) { + Slice *t = SLICE(u); + + assert(t); + assert(t->state == SLICE_ACTIVE); + + /* We do not need to destroy the cgroup explicitly, + * unit_notify() will do that for us anyway. */ + + slice_set_state(t, SLICE_DEAD); + return 1; +} + +static int slice_serialize(Unit *u, FILE *f, FDSet *fds) { + Slice *s = SLICE(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", slice_state_to_string(s->state)); + + return 0; +} + +static int slice_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Slice *s = SLICE(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + SliceState state; + + state = slice_state_from_string(value); + if (state < 0) + log_debug("Failed to parse state value %s", value); + else + s->deserialized_state = state; + + } else + log_debug("Unknown serialization key '%s'", key); + + return 0; +} + +static UnitActiveState slice_active_state(Unit *u) { + assert(u); + + return state_translation_table[SLICE(u)->state]; +} + +static const char *slice_sub_state_to_string(Unit *u) { + assert(u); + + return slice_state_to_string(SLICE(u)->state); +} + +static int slice_make_perpetual(Manager *m, const char *name, Unit **ret) { + Unit *u; + int r; + + assert(m); + assert(name); + + u = manager_get_unit(m, name); + if (!u) { + r = unit_new_for_name(m, sizeof(Slice), name, &u); + if (r < 0) + return log_error_errno(r, "Failed to allocate the special %s unit: %m", name); + } + + u->perpetual = true; + SLICE(u)->deserialized_state = SLICE_ACTIVE; + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); + + if (ret) + *ret = u; + + return 0; +} + +static void slice_enumerate_perpetual(Manager *m) { + Unit *u; + int r; + + assert(m); + + r = slice_make_perpetual(m, SPECIAL_ROOT_SLICE, &u); + if (r >= 0 && manager_owns_host_root_cgroup(m)) { + Slice *s = SLICE(u); + + /* If we are managing the root cgroup then this means our root slice covers the whole system, which + * means the kernel will track CPU/tasks/memory for us anyway, and it is all available in /proc. Let's + * hence turn accounting on here, so that our APIs to query this data are available. */ + + s->cgroup_context.cpu_accounting = true; + s->cgroup_context.tasks_accounting = true; + s->cgroup_context.memory_accounting = true; + } + + if (MANAGER_IS_SYSTEM(m)) + (void) slice_make_perpetual(m, SPECIAL_SYSTEM_SLICE, NULL); +} + +static bool slice_freezer_action_supported_by_children(Unit *s) { + Unit *member; + + assert(s); + + UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) { + + if (member->type == UNIT_SLICE && + !slice_freezer_action_supported_by_children(member)) + return false; + + if (!UNIT_VTABLE(member)->freeze) + return false; + } + + return true; +} + +static int slice_freezer_action(Unit *s, FreezerAction action) { + Unit *member; + int r; + + assert(s); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); + + if (action == FREEZER_FREEZE && !slice_freezer_action_supported_by_children(s)) { + log_unit_warning(s, "Requested freezer operation is not supported by all children of the slice"); + return 0; + } + + UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) { + if (!member->cgroup_realized) + continue; + + if (action == FREEZER_FREEZE) + r = UNIT_VTABLE(member)->freeze(member); + else if (UNIT_VTABLE(member)->thaw) + r = UNIT_VTABLE(member)->thaw(member); + else + /* Thawing is requested but no corresponding method is available, ignore. */ + r = 0; + if (r < 0) + return r; + } + + return unit_cgroup_freezer_action(s, action); +} + +static int slice_freeze(Unit *s) { + assert(s); + + return slice_freezer_action(s, FREEZER_FREEZE); +} + +static int slice_thaw(Unit *s) { + assert(s); + + return slice_freezer_action(s, FREEZER_THAW); +} + +static bool slice_can_freeze(Unit *s) { + assert(s); + + return slice_freezer_action_supported_by_children(s); +} + +const UnitVTable slice_vtable = { + .object_size = sizeof(Slice), + .cgroup_context_offset = offsetof(Slice, cgroup_context), + + .sections = + "Unit\0" + "Slice\0" + "Install\0", + .private_section = "Slice", + + .can_transient = true, + .can_set_managed_oom = true, + + .init = slice_init, + .load = slice_load, + + .coldplug = slice_coldplug, + + .dump = slice_dump, + + .start = slice_start, + .stop = slice_stop, + + .freeze = slice_freeze, + .thaw = slice_thaw, + .can_freeze = slice_can_freeze, + + .serialize = slice_serialize, + .deserialize_item = slice_deserialize_item, + + .active_state = slice_active_state, + .sub_state_to_string = slice_sub_state_to_string, + + .bus_set_property = bus_slice_set_property, + .bus_commit_properties = bus_slice_commit_properties, + + .enumerate_perpetual = slice_enumerate_perpetual, + + .status_message_formats = { + .finished_start_job = { + [JOB_DONE] = "Created slice %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Removed slice %s.", + }, + }, +}; diff --git a/src/core/slice.h b/src/core/slice.h new file mode 100644 index 0000000..e2f9274 --- /dev/null +++ b/src/core/slice.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "unit.h" + +typedef struct Slice Slice; + +struct Slice { + Unit meta; + + SliceState state, deserialized_state; + + CGroupContext cgroup_context; +}; + +extern const UnitVTable slice_vtable; + +DEFINE_CAST(SLICE, Slice); diff --git a/src/core/smack-setup.c b/src/core/smack-setup.c new file mode 100644 index 0000000..7ea902b --- /dev/null +++ b/src/core/smack-setup.c @@ -0,0 +1,393 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Intel Corporation + Authors: + Nathaniel Chen +***/ + +#include +#include +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "macro.h" +#include "smack-setup.h" +#include "string-util.h" + +#if ENABLE_SMACK + +static int fdopen_unlocked_at(int dfd, const char *dir, const char *name, int *status, FILE **ret_file) { + int fd, r; + FILE *f; + + fd = openat(dfd, name, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + if (*status == 0) + *status = -errno; + + return log_warning_errno(errno, "Failed to open \"%s/%s\": %m", dir, name); + } + + r = fdopen_unlocked(fd, "r", &f); + if (r < 0) { + if (*status == 0) + *status = r; + + safe_close(fd); + return log_error_errno(r, "Failed to open \"%s/%s\": %m", dir, name); + } + + *ret_file = f; + return 0; +} + +static int write_access2_rules(const char *srcdir) { + _cleanup_close_ int load2_fd = -EBADF, change_fd = -EBADF; + _cleanup_closedir_ DIR *dir = NULL; + int dfd = -EBADF, r = 0; + + load2_fd = open("/sys/fs/smackfs/load2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (load2_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/load2': %m"); + return -errno; /* negative error */ + } + + change_fd = open("/sys/fs/smackfs/change-rule", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (change_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/change-rule': %m"); + return -errno; /* negative error */ + } + + /* write rules to load2 or change-rule from every file in the directory */ + dir = opendir(srcdir); + if (!dir) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir); + return errno; /* positive on purpose */ + } + + dfd = dirfd(dir); + assert(dfd >= 0); + + FOREACH_DIRENT(entry, dir, return 0) { + _cleanup_fclose_ FILE *policy = NULL; + + if (!dirent_is_file(entry)) + continue; + + if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0) + continue; + + /* load2 write rules in the kernel require a line buffered stream */ + for (;;) { + _cleanup_free_ char *buf = NULL, *sbj = NULL, *obj = NULL, *acc1 = NULL, *acc2 = NULL; + int q; + + q = read_line(policy, NAME_MAX, &buf); + if (q < 0) + return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name); + if (q == 0) + break; + + if (isempty(buf) || strchr(COMMENTS, buf[0])) + continue; + + /* if 3 args -> load rule : subject object access1 */ + /* if 4 args -> change rule : subject object access1 access2 */ + if (sscanf(buf, "%ms %ms %ms %ms", &sbj, &obj, &acc1, &acc2) < 3) { + log_error_errno(errno, "Failed to parse rule '%s' in '%s', ignoring.", buf, entry->d_name); + continue; + } + + if (write(isempty(acc2) ? load2_fd : change_fd, buf, strlen(buf)) < 0) { + if (r == 0) + r = -errno; + log_error_errno(errno, "Failed to write '%s' to '%s' in '%s': %m", + buf, isempty(acc2) ? "/sys/fs/smackfs/load2" : "/sys/fs/smackfs/change-rule", entry->d_name); + } + } + } + + return r; +} + +static int write_cipso2_rules(const char *srcdir) { + _cleanup_close_ int cipso2_fd = -EBADF; + _cleanup_closedir_ DIR *dir = NULL; + int dfd = -EBADF, r = 0; + + cipso2_fd = open("/sys/fs/smackfs/cipso2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (cipso2_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/cipso2': %m"); + return -errno; /* negative error */ + } + + /* write rules to cipso2 from every file in the directory */ + dir = opendir(srcdir); + if (!dir) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir); + return errno; /* positive on purpose */ + } + + dfd = dirfd(dir); + assert(dfd >= 0); + + FOREACH_DIRENT(entry, dir, return 0) { + _cleanup_fclose_ FILE *policy = NULL; + + if (!dirent_is_file(entry)) + continue; + + if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0) + continue; + + /* cipso2 write rules in the kernel require a line buffered stream */ + for (;;) { + _cleanup_free_ char *buf = NULL; + int q; + + q = read_line(policy, NAME_MAX, &buf); + if (q < 0) + return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name); + if (q == 0) + break; + + if (isempty(buf) || strchr(COMMENTS, buf[0])) + continue; + + if (write(cipso2_fd, buf, strlen(buf)) < 0) { + if (r == 0) + r = -errno; + log_error_errno(errno, "Failed to write '%s' to '/sys/fs/smackfs/cipso2' in '%s': %m", + buf, entry->d_name); + break; + } + } + } + + return r; +} + +static int write_netlabel_rules(const char *srcdir) { + _cleanup_fclose_ FILE *dst = NULL; + _cleanup_closedir_ DIR *dir = NULL; + int dfd = -EBADF, r = 0; + + dst = fopen("/sys/fs/smackfs/netlabel", "we"); + if (!dst) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open /sys/fs/smackfs/netlabel: %m"); + return -errno; /* negative error */ + } + + /* write rules to dst from every file in the directory */ + dir = opendir(srcdir); + if (!dir) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to opendir %s: %m", srcdir); + return errno; /* positive on purpose */ + } + + dfd = dirfd(dir); + assert(dfd >= 0); + + FOREACH_DIRENT(entry, dir, return 0) { + _cleanup_fclose_ FILE *policy = NULL; + + if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0) + continue; + + /* load2 write rules in the kernel require a line buffered stream */ + for (;;) { + _cleanup_free_ char *buf = NULL; + int q; + + q = read_line(policy, NAME_MAX, &buf); + if (q < 0) + return log_error_errno(q, "Failed to read line from %s: %m", entry->d_name); + if (q == 0) + break; + + if (!fputs(buf, dst)) { + if (r == 0) + r = -EINVAL; + log_error_errno(errno, "Failed to write line to /sys/fs/smackfs/netlabel: %m"); + break; + } + q = fflush_and_check(dst); + if (q < 0) { + if (r == 0) + r = q; + log_error_errno(q, "Failed to flush writes to /sys/fs/smackfs/netlabel: %m"); + break; + } + } + } + + return r; +} + +static int write_onlycap_list(void) { + _cleanup_close_ int onlycap_fd = -EBADF; + _cleanup_free_ char *list = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t len = 0; + int r; + + f = fopen("/etc/smack/onlycap", "re"); + if (!f) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to read '/etc/smack/onlycap': %m"); + + return errno == ENOENT ? ENOENT : -errno; + } + + for (;;) { + _cleanup_free_ char *buf = NULL; + size_t l; + + r = read_line(f, LONG_LINE_MAX, &buf); + if (r < 0) + return log_error_errno(r, "Failed to read line from /etc/smack/onlycap: %m"); + if (r == 0) + break; + + if (isempty(buf) || strchr(COMMENTS, *buf)) + continue; + + l = strlen(buf); + if (!GREEDY_REALLOC(list, len + l + 1)) + return log_oom(); + + stpcpy(list + len, buf)[0] = ' '; + len += l + 1; + } + + if (len == 0) + return 0; + + list[len - 1] = 0; + + onlycap_fd = open("/sys/fs/smackfs/onlycap", O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (onlycap_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/onlycap': %m"); + return -errno; /* negative error */ + } + + r = write(onlycap_fd, list, len); + if (r < 0) + return log_error_errno(errno, "Failed to write onlycap list(%s) to '/sys/fs/smackfs/onlycap': %m", list); + + return 0; +} + +#endif + +int mac_smack_setup(bool *loaded_policy) { + +#if ENABLE_SMACK + + int r; + + assert(loaded_policy); + + r = write_access2_rules("/etc/smack/accesses.d/"); + switch (r) { + case -ENOENT: + log_debug("Smack is not enabled in the kernel."); + return 0; + case ENOENT: + log_debug("Smack access rules directory '/etc/smack/accesses.d/' not found"); + return 0; + case 0: + log_info("Successfully loaded Smack policies."); + break; + default: + log_warning_errno(r, "Failed to load Smack access rules, ignoring: %m"); + return 0; + } + +#if HAVE_SMACK_RUN_LABEL + r = write_string_file("/proc/self/attr/current", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set SMACK label \"" SMACK_RUN_LABEL "\" on self: %m"); + r = write_string_file("/sys/fs/smackfs/ambient", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set SMACK ambient label \"" SMACK_RUN_LABEL "\": %m"); + r = write_string_file("/sys/fs/smackfs/netlabel", + "0.0.0.0/0 " SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set SMACK netlabel rule \"0.0.0.0/0 " SMACK_RUN_LABEL "\": %m"); + r = write_string_file("/sys/fs/smackfs/netlabel", "127.0.0.1 -CIPSO", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set SMACK netlabel rule \"127.0.0.1 -CIPSO\": %m"); +#endif + + r = write_cipso2_rules("/etc/smack/cipso.d/"); + switch (r) { + case -ENOENT: + log_debug("Smack/CIPSO is not enabled in the kernel."); + return 0; + case ENOENT: + log_debug("Smack/CIPSO access rules directory '/etc/smack/cipso.d/' not found"); + break; + case 0: + log_info("Successfully loaded Smack/CIPSO policies."); + break; + default: + log_warning_errno(r, "Failed to load Smack/CIPSO access rules, ignoring: %m"); + break; + } + + r = write_netlabel_rules("/etc/smack/netlabel.d/"); + switch (r) { + case -ENOENT: + log_debug("Smack/CIPSO is not enabled in the kernel."); + return 0; + case ENOENT: + log_debug("Smack network host rules directory '/etc/smack/netlabel.d/' not found"); + break; + case 0: + log_info("Successfully loaded Smack network host rules."); + break; + default: + log_warning_errno(r, "Failed to load Smack network host rules: %m, ignoring."); + break; + } + + r = write_onlycap_list(); + switch (r) { + case -ENOENT: + log_debug("Smack is not enabled in the kernel."); + break; + case ENOENT: + log_debug("Smack onlycap list file '/etc/smack/onlycap' not found"); + break; + case 0: + log_info("Successfully wrote Smack onlycap list."); + break; + default: + return log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Failed to write Smack onlycap list: %m"), + "MESSAGE_ID=" SD_MESSAGE_SMACK_FAILED_WRITE_STR); + } + + *loaded_policy = true; + +#endif + + return 0; +} diff --git a/src/core/smack-setup.h b/src/core/smack-setup.h new file mode 100644 index 0000000..d29370d --- /dev/null +++ b/src/core/smack-setup.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2013 Intel Corporation + Authors: + Nathaniel Chen +***/ + +int mac_smack_setup(bool *loaded_policy); diff --git a/src/core/socket.c b/src/core/socket.c new file mode 100644 index 0000000..388be62 --- /dev/null +++ b/src/core/socket.c @@ -0,0 +1,3617 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bus-error.h" +#include "bus-util.h" +#include "chase.h" +#include "constants.h" +#include "copy.h" +#include "dbus-socket.h" +#include "dbus-unit.h" +#include "errno-list.h" +#include "exit-status.h" +#include "fd-util.h" +#include "format-util.h" +#include "in-addr-util.h" +#include "io-util.h" +#include "ip-protocol-list.h" +#include "label-util.h" +#include "log.h" +#include "mkdir-label.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "selinux-util.h" +#include "serialize.h" +#include "service.h" +#include "signal-util.h" +#include "smack-util.h" +#include "socket.h" +#include "socket-netlink.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" +#include "user-util.h" + +struct SocketPeer { + unsigned n_ref; + + Socket *socket; + union sockaddr_union peer; + socklen_t peer_salen; +}; + +static const UnitActiveState state_translation_table[_SOCKET_STATE_MAX] = { + [SOCKET_DEAD] = UNIT_INACTIVE, + [SOCKET_START_PRE] = UNIT_ACTIVATING, + [SOCKET_START_CHOWN] = UNIT_ACTIVATING, + [SOCKET_START_POST] = UNIT_ACTIVATING, + [SOCKET_LISTENING] = UNIT_ACTIVE, + [SOCKET_RUNNING] = UNIT_ACTIVE, + [SOCKET_STOP_PRE] = UNIT_DEACTIVATING, + [SOCKET_STOP_PRE_SIGTERM] = UNIT_DEACTIVATING, + [SOCKET_STOP_PRE_SIGKILL] = UNIT_DEACTIVATING, + [SOCKET_STOP_POST] = UNIT_DEACTIVATING, + [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SOCKET_FAILED] = UNIT_FAILED, + [SOCKET_CLEANING] = UNIT_MAINTENANCE, +}; + +static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static void flush_ports(Socket *s); + +static void socket_init(Unit *u) { + Socket *s = SOCKET(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + s->backlog = SOMAXCONN_DELUXE; + s->timeout_usec = u->manager->defaults.timeout_start_usec; + s->directory_mode = 0755; + s->socket_mode = 0666; + + s->max_connections = 64; + + s->priority = -1; + s->ip_tos = -1; + s->ip_ttl = -1; + s->mark = -1; + + s->exec_context.std_output = u->manager->defaults.std_output; + s->exec_context.std_error = u->manager->defaults.std_error; + + s->control_pid = PIDREF_NULL; + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + + s->trigger_limit = RATELIMIT_OFF; + + s->poll_limit_interval = USEC_INFINITY; + s->poll_limit_burst = UINT_MAX; +} + +static void socket_unwatch_control_pid(Socket *s) { + assert(s); + + if (!pidref_is_set(&s->control_pid)) + return; + + unit_unwatch_pidref(UNIT(s), &s->control_pid); + pidref_done(&s->control_pid); +} + +static void socket_cleanup_fd_list(SocketPort *p) { + assert(p); + + close_many(p->auxiliary_fds, p->n_auxiliary_fds); + p->auxiliary_fds = mfree(p->auxiliary_fds); + p->n_auxiliary_fds = 0; +} + +SocketPort *socket_port_free(SocketPort *p) { + if (!p) + return NULL; + + sd_event_source_unref(p->event_source); + + socket_cleanup_fd_list(p); + safe_close(p->fd); + free(p->path); + + return mfree(p); +} + +void socket_free_ports(Socket *s) { + assert(s); + + LIST_CLEAR(port, s->ports, socket_port_free); +} + +static void socket_done(Unit *u) { + Socket *s = SOCKET(u); + SocketPeer *p; + + assert(s); + + socket_free_ports(s); + + while ((p = set_steal_first(s->peers_by_address))) + p->socket = NULL; + + s->peers_by_address = set_free(s->peers_by_address); + + s->exec_runtime = exec_runtime_free(s->exec_runtime); + exec_command_free_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX); + s->control_command = NULL; + + socket_unwatch_control_pid(s); + + unit_ref_unset(&s->service); + + s->tcp_congestion = mfree(s->tcp_congestion); + s->bind_to_device = mfree(s->bind_to_device); + + s->smack = mfree(s->smack); + s->smack_ip_in = mfree(s->smack_ip_in); + s->smack_ip_out = mfree(s->smack_ip_out); + + strv_free(s->symlinks); + + s->user = mfree(s->user); + s->group = mfree(s->group); + + s->fdname = mfree(s->fdname); + + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); +} + +static int socket_arm_timer(Socket *s, bool relative, usec_t usec) { + assert(s); + + return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, socket_dispatch_timer); +} + +static bool have_non_accept_socket(Socket *s) { + assert(s); + + if (!s->accept) + return true; + + LIST_FOREACH(port, p, s->ports) { + + if (p->type != SOCKET_SOCKET) + return true; + + if (!socket_address_can_accept(&p->address)) + return true; + } + + return false; +} + +static int socket_add_mount_dependencies(Socket *s) { + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + const char *path = NULL; + + if (p->type == SOCKET_SOCKET) + path = socket_address_get_path(&p->address); + else if (IN_SET(p->type, SOCKET_FIFO, SOCKET_SPECIAL, SOCKET_USB_FUNCTION)) + path = p->path; + + if (!path) + continue; + + r = unit_require_mounts_for(UNIT(s), path, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + return 0; +} + +static int socket_add_device_dependencies(Socket *s) { + char *t; + + assert(s); + + if (!s->bind_to_device || streq(s->bind_to_device, "lo")) + return 0; + + t = strjoina("/sys/subsystem/net/devices/", s->bind_to_device); + return unit_add_node_dependency(UNIT(s), t, UNIT_BINDS_TO, UNIT_DEPENDENCY_FILE); +} + +static int socket_add_default_dependencies(Socket *s) { + int r; + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SOCKETS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) { + r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } + + return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static bool socket_has_exec(Socket *s) { + unsigned i; + assert(s); + + for (i = 0; i < _SOCKET_EXEC_COMMAND_MAX; i++) + if (s->exec_command[i]) + return true; + + return false; +} + +static int socket_add_extras(Socket *s) { + Unit *u = UNIT(s); + int r; + + assert(s); + + /* Pick defaults for the trigger limit, if nothing was explicitly configured. We pick a relatively high limit + * in Accept=yes mode, and a lower limit for Accept=no. Reason: in Accept=yes mode we are invoking accept() + * ourselves before the trigger limit can hit, thus incoming connections are taken off the socket queue quickly + * and reliably. This is different for Accept=no, where the spawned service has to take the incoming traffic + * off the queues, which it might not necessarily do. Moreover, while Accept=no services are supposed to + * process whatever is queued in one go, and thus should normally never have to be started frequently. This is + * different for Accept=yes where each connection is processed by a new service instance, and thus frequent + * service starts are typical. + * + * For the poll limit we follow a similar rule, but use 3/4th of the trigger limit parameters, to + * trigger this earlier. */ + + if (s->trigger_limit.interval == USEC_INFINITY) + s->trigger_limit.interval = 2 * USEC_PER_SEC; + if (s->trigger_limit.burst == UINT_MAX) + s->trigger_limit.burst = s->accept ? 200 : 20; + + if (s->poll_limit_interval == USEC_INFINITY) + s->poll_limit_interval = 2 * USEC_PER_SEC; + if (s->poll_limit_burst == UINT_MAX) + s->poll_limit_burst = s->accept ? 150 : 15; + + if (have_non_accept_socket(s)) { + + if (!UNIT_DEREF(s->service)) { + Unit *x; + + r = unit_load_related_unit(u, ".service", &x); + if (r < 0) + return r; + + unit_ref_set(&s->service, u, x); + } + + r = unit_add_two_dependencies(u, UNIT_BEFORE, UNIT_TRIGGERS, UNIT_DEREF(s->service), true, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + } + + r = socket_add_mount_dependencies(s); + if (r < 0) + return r; + + r = socket_add_device_dependencies(s); + if (r < 0) + return r; + + r = unit_patch_contexts(u); + if (r < 0) + return r; + + if (socket_has_exec(s)) { + r = unit_add_exec_dependencies(u, &s->exec_context); + if (r < 0) + return r; + } + + r = unit_set_default_slice(u); + if (r < 0) + return r; + + r = socket_add_default_dependencies(s); + if (r < 0) + return r; + + return 0; +} + +static const char *socket_find_symlink_target(Socket *s) { + const char *found = NULL; + + LIST_FOREACH(port, p, s->ports) { + const char *f = NULL; + + switch (p->type) { + + case SOCKET_FIFO: + f = p->path; + break; + + case SOCKET_SOCKET: + f = socket_address_get_path(&p->address); + break; + + default: + break; + } + + if (f) { + if (found) + return NULL; + + found = f; + } + } + + return found; +} + +static int socket_verify(Socket *s) { + assert(s); + assert(UNIT(s)->load_state == UNIT_LOADED); + + if (!s->ports) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has no Listen setting (ListenStream=, ListenDatagram=, ListenFIFO=, ...). Refusing."); + + if (s->accept && have_non_accept_socket(s)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit configured for accepting sockets, but sockets are non-accepting. Refusing."); + + if (s->accept && s->max_connections <= 0) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "MaxConnection= setting too small. Refusing."); + + if (s->accept && UNIT_DEREF(s->service)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Explicit service configuration for accepting socket units not supported. Refusing."); + + if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing."); + + if (!strv_isempty(s->symlinks) && !socket_find_symlink_target(s)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has symlinks set but none or more than one node in the file system. Refusing."); + + return 0; +} + +static void peer_address_hash_func(const SocketPeer *s, struct siphash *state) { + assert(s); + + if (s->peer.sa.sa_family == AF_INET) + siphash24_compress(&s->peer.in.sin_addr, sizeof(s->peer.in.sin_addr), state); + else if (s->peer.sa.sa_family == AF_INET6) + siphash24_compress(&s->peer.in6.sin6_addr, sizeof(s->peer.in6.sin6_addr), state); + else if (s->peer.sa.sa_family == AF_VSOCK) + siphash24_compress(&s->peer.vm.svm_cid, sizeof(s->peer.vm.svm_cid), state); + else + assert_not_reached(); +} + +static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) { + int r; + + r = CMP(x->peer.sa.sa_family, y->peer.sa.sa_family); + if (r != 0) + return r; + + switch (x->peer.sa.sa_family) { + case AF_INET: + return memcmp(&x->peer.in.sin_addr, &y->peer.in.sin_addr, sizeof(x->peer.in.sin_addr)); + case AF_INET6: + return memcmp(&x->peer.in6.sin6_addr, &y->peer.in6.sin6_addr, sizeof(x->peer.in6.sin6_addr)); + case AF_VSOCK: + return CMP(x->peer.vm.svm_cid, y->peer.vm.svm_cid); + } + assert_not_reached(); +} + +DEFINE_PRIVATE_HASH_OPS(peer_address_hash_ops, SocketPeer, peer_address_hash_func, peer_address_compare_func); + +static int socket_load(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + /* This is a new unit? Then let's add in some extras */ + r = socket_add_extras(s); + if (r < 0) + return r; + + return socket_verify(s); +} + +static SocketPeer *socket_peer_new(void) { + SocketPeer *p; + + p = new(SocketPeer, 1); + if (!p) + return NULL; + + *p = (SocketPeer) { + .n_ref = 1, + }; + return p; +} + +static SocketPeer *socket_peer_free(SocketPeer *p) { + assert(p); + + if (p->socket) + set_remove(p->socket->peers_by_address, p); + + return mfree(p); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(SocketPeer, socket_peer, socket_peer_free); + +int socket_acquire_peer(Socket *s, int fd, SocketPeer **ret) { + _cleanup_(socket_peer_unrefp) SocketPeer *remote = NULL; + SocketPeer sa = { + .peer_salen = sizeof(union sockaddr_union), + }, *i; + int r; + + assert(fd >= 0); + assert(s); + assert(ret); + + if (getpeername(fd, &sa.peer.sa, &sa.peer_salen) < 0) + return log_unit_error_errno(UNIT(s), errno, "getpeername() failed: %m"); + + if (!IN_SET(sa.peer.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) { + *ret = NULL; + return 0; + } + + i = set_get(s->peers_by_address, &sa); + if (i) { + *ret = socket_peer_ref(i); + return 1; + } + + remote = socket_peer_new(); + if (!remote) + return log_oom(); + + remote->peer = sa.peer; + remote->peer_salen = sa.peer_salen; + + r = set_ensure_put(&s->peers_by_address, &peer_address_hash_ops, remote); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to insert peer info into hash table: %m"); + + remote->socket = s; + + *ret = TAKE_PTR(remote); + return 1; +} + +static const char* listen_lookup(int family, int type) { + + if (family == AF_NETLINK) + return "ListenNetlink"; + + if (type == SOCK_STREAM) + return "ListenStream"; + else if (type == SOCK_DGRAM) + return "ListenDatagram"; + else if (type == SOCK_SEQPACKET) + return "ListenSequentialPacket"; + + assert_not_reached(); + return NULL; +} + +static void socket_dump(Unit *u, FILE *f, const char *prefix) { + Socket *s = SOCKET(u); + const char *prefix2, *str; + + assert(s); + assert(f); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + fprintf(f, + "%sSocket State: %s\n" + "%sResult: %s\n" + "%sClean Result: %s\n" + "%sBindIPv6Only: %s\n" + "%sBacklog: %u\n" + "%sSocketMode: %04o\n" + "%sDirectoryMode: %04o\n" + "%sKeepAlive: %s\n" + "%sNoDelay: %s\n" + "%sFreeBind: %s\n" + "%sTransparent: %s\n" + "%sBroadcast: %s\n" + "%sPassCredentials: %s\n" + "%sPassSecurity: %s\n" + "%sPassPacketInfo: %s\n" + "%sTCPCongestion: %s\n" + "%sRemoveOnStop: %s\n" + "%sWritable: %s\n" + "%sFileDescriptorName: %s\n" + "%sSELinuxContextFromNet: %s\n", + prefix, socket_state_to_string(s->state), + prefix, socket_result_to_string(s->result), + prefix, socket_result_to_string(s->clean_result), + prefix, socket_address_bind_ipv6_only_to_string(s->bind_ipv6_only), + prefix, s->backlog, + prefix, s->socket_mode, + prefix, s->directory_mode, + prefix, yes_no(s->keep_alive), + prefix, yes_no(s->no_delay), + prefix, yes_no(s->free_bind), + prefix, yes_no(s->transparent), + prefix, yes_no(s->broadcast), + prefix, yes_no(s->pass_cred), + prefix, yes_no(s->pass_sec), + prefix, yes_no(s->pass_pktinfo), + prefix, strna(s->tcp_congestion), + prefix, yes_no(s->remove_on_stop), + prefix, yes_no(s->writable), + prefix, socket_fdname(s), + prefix, yes_no(s->selinux_context_from_net)); + + if (s->timestamping != SOCKET_TIMESTAMPING_OFF) + fprintf(f, + "%sTimestamping: %s\n", + prefix, socket_timestamping_to_string(s->timestamping)); + + if (pidref_is_set(&s->control_pid)) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, s->control_pid.pid); + + if (s->bind_to_device) + fprintf(f, + "%sBindToDevice: %s\n", + prefix, s->bind_to_device); + + if (s->accept) + fprintf(f, + "%sAccepted: %u\n" + "%sNConnections: %u\n" + "%sMaxConnections: %u\n" + "%sMaxConnectionsPerSource: %u\n", + prefix, s->n_accepted, + prefix, s->n_connections, + prefix, s->max_connections, + prefix, s->max_connections_per_source); + else + fprintf(f, + "%sFlushPending: %s\n", + prefix, yes_no(s->flush_pending)); + + + if (s->priority >= 0) + fprintf(f, + "%sPriority: %i\n", + prefix, s->priority); + + if (s->receive_buffer > 0) + fprintf(f, + "%sReceiveBuffer: %zu\n", + prefix, s->receive_buffer); + + if (s->send_buffer > 0) + fprintf(f, + "%sSendBuffer: %zu\n", + prefix, s->send_buffer); + + if (s->ip_tos >= 0) + fprintf(f, + "%sIPTOS: %i\n", + prefix, s->ip_tos); + + if (s->ip_ttl >= 0) + fprintf(f, + "%sIPTTL: %i\n", + prefix, s->ip_ttl); + + if (s->pipe_size > 0) + fprintf(f, + "%sPipeSize: %zu\n", + prefix, s->pipe_size); + + if (s->mark >= 0) + fprintf(f, + "%sMark: %i\n", + prefix, s->mark); + + if (s->mq_maxmsg > 0) + fprintf(f, + "%sMessageQueueMaxMessages: %li\n", + prefix, s->mq_maxmsg); + + if (s->mq_msgsize > 0) + fprintf(f, + "%sMessageQueueMessageSize: %li\n", + prefix, s->mq_msgsize); + + if (s->reuse_port) + fprintf(f, + "%sReusePort: %s\n", + prefix, yes_no(s->reuse_port)); + + if (s->smack) + fprintf(f, + "%sSmackLabel: %s\n", + prefix, s->smack); + + if (s->smack_ip_in) + fprintf(f, + "%sSmackLabelIPIn: %s\n", + prefix, s->smack_ip_in); + + if (s->smack_ip_out) + fprintf(f, + "%sSmackLabelIPOut: %s\n", + prefix, s->smack_ip_out); + + if (!isempty(s->user) || !isempty(s->group)) + fprintf(f, + "%sSocketUser: %s\n" + "%sSocketGroup: %s\n", + prefix, strna(s->user), + prefix, strna(s->group)); + + if (timestamp_is_set(s->keep_alive_time)) + fprintf(f, + "%sKeepAliveTimeSec: %s\n", + prefix, FORMAT_TIMESPAN(s->keep_alive_time, USEC_PER_SEC)); + + if (s->keep_alive_interval > 0) + fprintf(f, + "%sKeepAliveIntervalSec: %s\n", + prefix, FORMAT_TIMESPAN(s->keep_alive_interval, USEC_PER_SEC)); + + if (s->keep_alive_cnt > 0) + fprintf(f, + "%sKeepAliveProbes: %u\n", + prefix, s->keep_alive_cnt); + + if (s->defer_accept > 0) + fprintf(f, + "%sDeferAcceptSec: %s\n", + prefix, FORMAT_TIMESPAN(s->defer_accept, USEC_PER_SEC)); + + LIST_FOREACH(port, p, s->ports) { + + switch (p->type) { + case SOCKET_SOCKET: { + _cleanup_free_ char *k = NULL; + int r; + + r = socket_address_print(&p->address, &k); + if (r < 0) { + errno = -r; + fprintf(f, "%s%s: %m\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type)); + } else + fprintf(f, "%s%s: %s\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type), k); + break; + } + case SOCKET_SPECIAL: + fprintf(f, "%sListenSpecial: %s\n", prefix, p->path); + break; + case SOCKET_USB_FUNCTION: + fprintf(f, "%sListenUSBFunction: %s\n", prefix, p->path); + break; + case SOCKET_MQUEUE: + fprintf(f, "%sListenMessageQueue: %s\n", prefix, p->path); + break; + default: + fprintf(f, "%sListenFIFO: %s\n", prefix, p->path); + } + } + + fprintf(f, + "%sTriggerLimitIntervalSec: %s\n" + "%sTriggerLimitBurst: %u\n" + "%sPollLimitIntervalSec: %s\n" + "%sPollLimitBurst: %u\n", + prefix, FORMAT_TIMESPAN(s->trigger_limit.interval, USEC_PER_SEC), + prefix, s->trigger_limit.burst, + prefix, FORMAT_TIMESPAN(s->poll_limit_interval, USEC_PER_SEC), + prefix, s->poll_limit_burst); + + str = ip_protocol_to_name(s->socket_protocol); + if (str) + fprintf(f, "%sSocketProtocol: %s\n", prefix, str); + + if (!strv_isempty(s->symlinks)) { + fprintf(f, "%sSymlinks:", prefix); + STRV_FOREACH(q, s->symlinks) + fprintf(f, " %s", *q); + + fprintf(f, "\n"); + } + + fprintf(f, + "%sTimeoutSec: %s\n", + prefix, FORMAT_TIMESPAN(s->timeout_usec, USEC_PER_SEC)); + + exec_context_dump(&s->exec_context, f, prefix); + kill_context_dump(&s->kill_context, f, prefix); + + for (SocketExecCommand c = 0; c < _SOCKET_EXEC_COMMAND_MAX; c++) { + if (!s->exec_command[c]) + continue; + + fprintf(f, "%s-> %s:\n", + prefix, socket_exec_command_to_string(c)); + + exec_command_dump_list(s->exec_command[c], f, prefix2); + } + + cgroup_context_dump(UNIT(s), f, prefix); +} + +static int instance_from_socket(int fd, unsigned nr, char **instance) { + socklen_t l; + char *r; + union sockaddr_union local, remote; + + assert(fd >= 0); + assert(instance); + + l = sizeof(local); + if (getsockname(fd, &local.sa, &l) < 0) + return -errno; + + l = sizeof(remote); + if (getpeername(fd, &remote.sa, &l) < 0) + return -errno; + + switch (local.sa.sa_family) { + + case AF_INET: { + uint32_t + a = be32toh(local.in.sin_addr.s_addr), + b = be32toh(remote.in.sin_addr.s_addr); + + if (asprintf(&r, + "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u", + nr, + a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF, + be16toh(local.in.sin_port), + b >> 24, (b >> 16) & 0xFF, (b >> 8) & 0xFF, b & 0xFF, + be16toh(remote.in.sin_port)) < 0) + return -ENOMEM; + + break; + } + + case AF_INET6: { + static const unsigned char ipv4_prefix[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF + }; + + if (memcmp(&local.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0 && + memcmp(&remote.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0) { + const uint8_t + *a = local.in6.sin6_addr.s6_addr+12, + *b = remote.in6.sin6_addr.s6_addr+12; + + if (asprintf(&r, + "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u", + nr, + a[0], a[1], a[2], a[3], + be16toh(local.in6.sin6_port), + b[0], b[1], b[2], b[3], + be16toh(remote.in6.sin6_port)) < 0) + return -ENOMEM; + } else { + if (asprintf(&r, + "%u-%s:%u-%s:%u", + nr, + IN6_ADDR_TO_STRING(&local.in6.sin6_addr), + be16toh(local.in6.sin6_port), + IN6_ADDR_TO_STRING(&remote.in6.sin6_addr), + be16toh(remote.in6.sin6_port)) < 0) + return -ENOMEM; + } + + break; + } + + case AF_UNIX: { + struct ucred ucred; + int k; + + k = getpeercred(fd, &ucred); + if (k >= 0) { + if (asprintf(&r, + "%u-"PID_FMT"-"UID_FMT, + nr, ucred.pid, ucred.uid) < 0) + return -ENOMEM; + } else if (k == -ENODATA) { + /* This handles the case where somebody is + * connecting from another pid/uid namespace + * (e.g. from outside of our container). */ + if (asprintf(&r, + "%u-unknown", + nr) < 0) + return -ENOMEM; + } else + return k; + + break; + } + + case AF_VSOCK: + if (asprintf(&r, + "%u-%u:%u-%u:%u", + nr, + local.vm.svm_cid, local.vm.svm_port, + remote.vm.svm_cid, remote.vm.svm_port) < 0) + return -ENOMEM; + + break; + + default: + assert_not_reached(); + } + + *instance = r; + return 0; +} + +static void socket_close_fds(Socket *s) { + assert(s); + + LIST_FOREACH(port, p, s->ports) { + bool was_open; + + was_open = p->fd >= 0; + + p->event_source = sd_event_source_disable_unref(p->event_source); + p->fd = safe_close(p->fd); + socket_cleanup_fd_list(p); + + /* One little note: we should normally not delete any sockets in the file system here! After all some + * other process we spawned might still have a reference of this fd and wants to continue to use + * it. Therefore we normally delete sockets in the file system before we create a new one, not after we + * stopped using one! That all said, if the user explicitly requested this, we'll delete them here + * anyway, but only then. */ + + if (!was_open || !s->remove_on_stop) + continue; + + switch (p->type) { + + case SOCKET_FIFO: + (void) unlink(p->path); + break; + + case SOCKET_MQUEUE: + (void) mq_unlink(p->path); + break; + + case SOCKET_SOCKET: + (void) socket_address_unlink(&p->address); + break; + + default: + break; + } + } + + if (s->remove_on_stop) + STRV_FOREACH(i, s->symlinks) + (void) unlink(*i); + + /* Note that we don't return NULL here, since s has not been freed. */ +} + +static void socket_apply_socket_options(Socket *s, SocketPort *p, int fd) { + int r; + + assert(s); + assert(p); + assert(fd >= 0); + + if (s->keep_alive) { + r = setsockopt_int(fd, SOL_SOCKET, SO_KEEPALIVE, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_KEEPALIVE failed: %m"); + } + + if (timestamp_is_set(s->keep_alive_time)) { + r = setsockopt_int(fd, SOL_TCP, TCP_KEEPIDLE, s->keep_alive_time / USEC_PER_SEC); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_KEEPIDLE failed: %m"); + } + + if (s->keep_alive_interval > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_KEEPINTVL, s->keep_alive_interval / USEC_PER_SEC); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_KEEPINTVL failed: %m"); + } + + if (s->keep_alive_cnt > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_KEEPCNT, s->keep_alive_cnt); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_KEEPCNT failed: %m"); + } + + if (s->defer_accept > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_DEFER_ACCEPT, s->defer_accept / USEC_PER_SEC); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_DEFER_ACCEPT failed: %m"); + } + + if (s->no_delay) { + if (s->socket_protocol == IPPROTO_SCTP) { + r = setsockopt_int(fd, SOL_SCTP, SCTP_NODELAY, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SCTP_NODELAY failed: %m"); + } else { + r = setsockopt_int(fd, SOL_TCP, TCP_NODELAY, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_NODELAY failed: %m"); + } + } + + if (s->broadcast) { + r = setsockopt_int(fd, SOL_SOCKET, SO_BROADCAST, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_BROADCAST failed: %m"); + } + + if (s->pass_cred) { + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_PASSCRED failed: %m"); + } + + if (s->pass_sec) { + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSSEC, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_PASSSEC failed: %m"); + } + + if (s->pass_pktinfo) { + r = socket_set_recvpktinfo(fd, socket_address_family(&p->address), true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to enable packet info socket option: %m"); + } + + if (s->timestamping != SOCKET_TIMESTAMPING_OFF) { + r = setsockopt_int(fd, SOL_SOCKET, + s->timestamping == SOCKET_TIMESTAMPING_NS ? SO_TIMESTAMPNS : SO_TIMESTAMP, + true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to enable timestamping socket option, ignoring: %m"); + } + + if (s->priority >= 0) { + r = setsockopt_int(fd, SOL_SOCKET, SO_PRIORITY, s->priority); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_PRIORITY failed: %m"); + } + + if (s->receive_buffer > 0) { + r = fd_set_rcvbuf(fd, s->receive_buffer, false); + if (r < 0) + log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r, + "SO_RCVBUF/SO_RCVBUFFORCE failed: %m"); + } + + if (s->send_buffer > 0) { + r = fd_set_sndbuf(fd, s->send_buffer, false); + if (r < 0) + log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r, + "SO_SNDBUF/SO_SNDBUFFORCE failed: %m"); + } + + if (s->mark >= 0) { + r = setsockopt_int(fd, SOL_SOCKET, SO_MARK, s->mark); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_MARK failed: %m"); + } + + if (s->ip_tos >= 0) { + r = setsockopt_int(fd, IPPROTO_IP, IP_TOS, s->ip_tos); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "IP_TOS failed: %m"); + } + + if (s->ip_ttl >= 0) { + r = socket_set_ttl(fd, socket_address_family(&p->address), s->ip_ttl); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "IP_TTL/IPV6_UNICAST_HOPS failed: %m"); + } + + if (s->tcp_congestion) + if (setsockopt(fd, SOL_TCP, TCP_CONGESTION, s->tcp_congestion, strlen(s->tcp_congestion)+1) < 0) + log_unit_warning_errno(UNIT(s), errno, "TCP_CONGESTION failed: %m"); + + if (s->smack_ip_in) { + r = mac_smack_apply_fd(fd, SMACK_ATTR_IPIN, s->smack_ip_in); + if (r < 0) + log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_in_fd: %m"); + } + + if (s->smack_ip_out) { + r = mac_smack_apply_fd(fd, SMACK_ATTR_IPOUT, s->smack_ip_out); + if (r < 0) + log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_out_fd: %m"); + } +} + +static void socket_apply_fifo_options(Socket *s, int fd) { + int r; + + assert(s); + assert(fd >= 0); + + if (s->pipe_size > 0) + if (fcntl(fd, F_SETPIPE_SZ, s->pipe_size) < 0) + log_unit_warning_errno(UNIT(s), errno, "Setting pipe size failed, ignoring: %m"); + + if (s->smack) { + r = mac_smack_apply_fd(fd, SMACK_ATTR_ACCESS, s->smack); + if (r < 0) + log_unit_error_errno(UNIT(s), r, "SMACK relabelling failed, ignoring: %m"); + } +} + +static int fifo_address_create( + const char *path, + mode_t directory_mode, + mode_t socket_mode) { + + _cleanup_close_ int fd = -EBADF; + mode_t old_mask; + struct stat st; + int r; + + assert(path); + + (void) mkdir_parents_label(path, directory_mode); + + r = mac_selinux_create_file_prepare(path, S_IFIFO); + if (r < 0) + return r; + + /* Enforce the right access mode for the fifo */ + old_mask = umask(~socket_mode); + + /* Include the original umask in our mask */ + (void) umask(~socket_mode | old_mask); + + r = mkfifo(path, socket_mode); + (void) umask(old_mask); + + if (r < 0 && errno != EEXIST) { + r = -errno; + goto fail; + } + + fd = open(path, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK | O_NOFOLLOW); + if (fd < 0) { + r = -errno; + goto fail; + } + + mac_selinux_create_file_clear(); + + if (fstat(fd, &st) < 0) { + r = -errno; + goto fail; + } + + if (!S_ISFIFO(st.st_mode) || + (st.st_mode & 0777) != (socket_mode & ~old_mask) || + st.st_uid != getuid() || + st.st_gid != getgid()) { + r = -EEXIST; + goto fail; + } + + return TAKE_FD(fd); + +fail: + mac_selinux_create_file_clear(); + return r; +} + +static int special_address_create(const char *path, bool writable) { + _cleanup_close_ int fd = -EBADF; + struct stat st; + + assert(path); + + fd = open(path, (writable ? O_RDWR : O_RDONLY)|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + /* Check whether this is a /proc, /sys or /dev file or char device */ + if (!S_ISREG(st.st_mode) && !S_ISCHR(st.st_mode)) + return -EEXIST; + + return TAKE_FD(fd); +} + +static int usbffs_address_create(const char *path) { + _cleanup_close_ int fd = -EBADF; + struct stat st; + + assert(path); + + fd = open(path, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + /* Check whether this is a regular file (ffs endpoint) */ + if (!S_ISREG(st.st_mode)) + return -EEXIST; + + return TAKE_FD(fd); +} + +static int mq_address_create( + const char *path, + mode_t mq_mode, + long maxmsg, + long msgsize) { + + _cleanup_close_ int fd = -EBADF; + struct stat st; + mode_t old_mask; + struct mq_attr _attr, *attr = NULL; + + assert(path); + + if (maxmsg > 0 && msgsize > 0) { + _attr = (struct mq_attr) { + .mq_flags = O_NONBLOCK, + .mq_maxmsg = maxmsg, + .mq_msgsize = msgsize, + }; + attr = &_attr; + } + + /* Enforce the right access mode for the mq */ + old_mask = umask(~mq_mode); + + /* Include the original umask in our mask */ + (void) umask(~mq_mode | old_mask); + fd = mq_open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_CREAT, mq_mode, attr); + (void) umask(old_mask); + + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + if ((st.st_mode & 0777) != (mq_mode & ~old_mask) || + st.st_uid != getuid() || + st.st_gid != getgid()) + return -EEXIST; + + return TAKE_FD(fd); +} + +static int socket_symlink(Socket *s) { + const char *p; + int r; + + assert(s); + + p = socket_find_symlink_target(s); + if (!p) + return 0; + + STRV_FOREACH(i, s->symlinks) { + (void) mkdir_parents_label(*i, s->directory_mode); + + r = symlink_idempotent(p, *i, false); + + if (r == -EEXIST && s->remove_on_stop) { + /* If there's already something where we want to create the symlink, and the destructive + * RemoveOnStop= mode is set, then we might as well try to remove what already exists and try + * again. */ + + if (unlink(*i) >= 0) + r = symlink_idempotent(p, *i, false); + } + + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to create symlink %s %s %s, ignoring: %m", + p, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), *i); + } + + return 0; +} + +static int usbffs_write_descs(int fd, Service *s) { + int r; + + if (!s->usb_function_descriptors || !s->usb_function_strings) + return -EINVAL; + + r = copy_file_fd(s->usb_function_descriptors, fd, 0); + if (r < 0) + return r; + + return copy_file_fd(s->usb_function_strings, fd, 0); +} + +static int usbffs_select_ep(const struct dirent *d) { + return d->d_name[0] != '.' && !streq(d->d_name, "ep0"); +} + +static int usbffs_dispatch_eps(SocketPort *p) { + _cleanup_free_ struct dirent **ent = NULL; + size_t n, k; + int r; + + r = scandir(p->path, &ent, usbffs_select_ep, alphasort); + if (r < 0) + return -errno; + + n = (size_t) r; + p->auxiliary_fds = new(int, n); + if (!p->auxiliary_fds) { + r = -ENOMEM; + goto clear; + } + + p->n_auxiliary_fds = n; + + k = 0; + for (size_t i = 0; i < n; ++i) { + _cleanup_free_ char *ep = NULL; + + ep = path_make_absolute(ent[i]->d_name, p->path); + if (!ep) { + r = -ENOMEM; + goto fail; + } + + path_simplify(ep); + + r = usbffs_address_create(ep); + if (r < 0) + goto fail; + + p->auxiliary_fds[k++] = r; + } + + r = 0; + goto clear; + +fail: + close_many(p->auxiliary_fds, k); + p->auxiliary_fds = mfree(p->auxiliary_fds); + p->n_auxiliary_fds = 0; + +clear: + free_many((void**) ent, n); + return r; +} + +int socket_load_service_unit(Socket *s, int cfd, Unit **ret) { + /* Figure out what the unit that will be used to handle the connections on the socket looks like. + * + * If cfd < 0, then we don't have a connection yet. In case of Accept=yes sockets, use a fake + * instance name. + */ + + if (UNIT_ISSET(s->service)) { + *ret = UNIT_DEREF(s->service); + return 0; + } + + if (!s->accept) + return -ENODATA; + + /* Build the instance name and load the unit */ + _cleanup_free_ char *prefix = NULL, *instance = NULL, *name = NULL; + int r; + + r = unit_name_to_prefix(UNIT(s)->id, &prefix); + if (r < 0) + return r; + + if (cfd >= 0) { + r = instance_from_socket(cfd, s->n_accepted, &instance); + if (r < 0) { + if (ERRNO_IS_DISCONNECT(r)) + /* ENOTCONN is legitimate if TCP RST was received. Other socket families might return + * different errors. This connection is over, but the socket unit lives on. */ + return log_unit_debug_errno(UNIT(s), r, + "Got %s on incoming socket, assuming aborted connection attempt, ignoring.", + errno_to_name(r)); + return r; + } + } + + /* For accepting sockets, we don't know how the instance will be called until we get a connection and + * can figure out what the peer name is. So let's use "internal" as the instance to make it clear + * that this is not an actual peer name. We use "unknown" when we cannot figure out the peer. */ + r = unit_name_build(prefix, instance ?: "internal", ".service", &name); + if (r < 0) + return r; + + return manager_load_unit(UNIT(s)->manager, name, NULL, NULL, ret); +} + +static int socket_determine_selinux_label(Socket *s, char **ret) { + int r; + + assert(s); + assert(ret); + + Unit *service; + ExecCommand *c; + const char *exec_context; + _cleanup_free_ char *path = NULL; + + r = socket_load_service_unit(s, -1, &service); + if (r == -ENODATA) + goto no_label; + if (r < 0) + return r; + + exec_context = SERVICE(service)->exec_context.selinux_context; + if (exec_context) { + char *con; + + con = strdup(exec_context); + if (!con) + return -ENOMEM; + + *ret = TAKE_PTR(con); + return 0; + } + + c = SERVICE(service)->exec_command[SERVICE_EXEC_START]; + if (!c) + goto no_label; + + r = chase(c->path, SERVICE(service)->exec_context.root_directory, CHASE_PREFIX_ROOT, &path, NULL); + if (r < 0) + goto no_label; + + r = mac_selinux_get_create_label_from_exe(path, ret); + if (IN_SET(r, -EPERM, -EOPNOTSUPP)) + goto no_label; + return r; + +no_label: + *ret = NULL; + return 0; +} + +static int socket_address_listen_do( + Socket *s, + const SocketAddress *address, + const char *label) { + + assert(s); + assert(address); + + return socket_address_listen( + address, + SOCK_CLOEXEC|SOCK_NONBLOCK, + s->backlog, + s->bind_ipv6_only, + s->bind_to_device, + s->reuse_port, + s->free_bind, + s->transparent, + s->directory_mode, + s->socket_mode, + label); +} + +#define log_address_error_errno(u, address, error, fmt) \ + ({ \ + _cleanup_free_ char *_t = NULL; \ + \ + (void) socket_address_print(address, &_t); \ + log_unit_error_errno(u, error, fmt, strna(_t)); \ + }) + +static int fork_needed(const SocketAddress *address, Socket *s) { + int r; + + assert(address); + assert(s); + + /* Check if we need to do the cgroup or netns stuff. If not we can do things much simpler. */ + + /* If there are any NFTSet= directives with cgroup source, we need the cgroup */ + Unit *u = UNIT(s); + CGroupContext *c = unit_get_cgroup_context(u); + if (c) + FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) + if (nft_set->source == NFT_SET_SOURCE_CGROUP) + return true; + + if (IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) { + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r != BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */ + return true; + } + + return exec_needs_network_namespace(&s->exec_context); +} + +static int socket_address_listen_in_cgroup( + Socket *s, + const SocketAddress *address, + const char *label) { + + _cleanup_(pidref_done) PidRef pid = PIDREF_NULL; + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + int fd, r; + + assert(s); + assert(address); + + /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the + * socket's cgroup and network namespace in which the socket is actually created. This way we ensure + * the socket is actually properly attached to the unit's cgroup for the purpose of BPF filtering and + * such. */ + + r = fork_needed(address, s); + if (r < 0) + return r; + if (r == 0) { + /* Shortcut things... */ + fd = socket_address_listen_do(s, address, label); + if (fd < 0) + return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); + + return fd; + } + + r = unit_setup_exec_runtime(UNIT(s)); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed acquire runtime: %m"); + + if (s->exec_context.network_namespace_path && + s->exec_runtime && + s->exec_runtime->shared && + s->exec_runtime->shared->netns_storage_socket[0] >= 0) { + r = open_shareable_ns_path(s->exec_runtime->shared->netns_storage_socket, s->exec_context.network_namespace_path, CLONE_NEWNET); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to open network namespace path %s: %m", s->exec_context.network_namespace_path); + } + + if (s->exec_context.ipc_namespace_path && + s->exec_runtime && + s->exec_runtime->shared && + s->exec_runtime->shared->ipcns_storage_socket[0] >= 0) { + r = open_shareable_ns_path(s->exec_runtime->shared->ipcns_storage_socket, s->exec_context.ipc_namespace_path, CLONE_NEWIPC); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to open IPC namespace path %s: %m", s->exec_context.ipc_namespace_path); + } + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); + + r = unit_fork_helper_process(UNIT(s), "(sd-listen)", &pid); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to fork off listener stub process: %m"); + if (r == 0) { + /* Child */ + + pair[0] = safe_close(pair[0]); + + if (exec_needs_network_namespace(&s->exec_context) && + s->exec_runtime && + s->exec_runtime->shared && + s->exec_runtime->shared->netns_storage_socket[0] >= 0) { + + if (ns_type_supported(NAMESPACE_NET)) { + r = setup_shareable_ns(s->exec_runtime->shared->netns_storage_socket, CLONE_NEWNET); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to join network namespace: %m"); + _exit(EXIT_NETWORK); + } + } else if (s->exec_context.network_namespace_path) { + log_unit_error(UNIT(s), "Network namespace path configured but network namespaces not supported."); + _exit(EXIT_NETWORK); + } else + log_unit_warning(UNIT(s), "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring."); + } + + fd = socket_address_listen_do(s, address, label); + if (fd < 0) { + log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); + _exit(EXIT_FAILURE); + } + + r = send_one_fd(pair[1], fd, 0); + if (r < 0) { + log_address_error_errno(UNIT(s), address, r, "Failed to send listening socket (%s) to parent: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + fd = receive_one_fd(pair[0], 0); + + /* We synchronously wait for the helper, as it shouldn't be slow */ + r = wait_for_terminate_and_check("(sd-listen)", pid.pid, WAIT_LOG_ABNORMAL); + if (r < 0) { + safe_close(fd); + return r; + } + + if (fd < 0) + return log_address_error_errno(UNIT(s), address, fd, "Failed to receive listening socket (%s): %m"); + + return fd; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(Socket *, socket_close_fds, NULL); + +static int socket_open_fds(Socket *orig_s) { + _cleanup_(socket_close_fdsp) Socket *s = orig_s; + _cleanup_(mac_selinux_freep) char *label = NULL; + bool know_label = false; + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + + if (p->fd >= 0) + continue; + + switch (p->type) { + + case SOCKET_SOCKET: + + if (!know_label) { + /* Figure out the label, if we don't it know yet. We do it once for the first + * socket where we need this and remember it for the rest. */ + + r = socket_determine_selinux_label(s, &label); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to determine SELinux label: %m"); + + know_label = true; + } + + /* Apply the socket protocol */ + switch (p->address.type) { + + case SOCK_STREAM: + case SOCK_SEQPACKET: + if (s->socket_protocol == IPPROTO_SCTP) + p->address.protocol = s->socket_protocol; + break; + + case SOCK_DGRAM: + if (s->socket_protocol == IPPROTO_UDPLITE) + p->address.protocol = s->socket_protocol; + break; + } + + p->fd = socket_address_listen_in_cgroup(s, &p->address, label); + if (p->fd < 0) + return p->fd; + + socket_apply_socket_options(s, p, p->fd); + socket_symlink(s); + break; + + case SOCKET_SPECIAL: + + p->fd = special_address_create(p->path, s->writable); + if (p->fd < 0) + return log_unit_error_errno(UNIT(s), p->fd, "Failed to open special file %s: %m", p->path); + break; + + case SOCKET_FIFO: + + p->fd = fifo_address_create( + p->path, + s->directory_mode, + s->socket_mode); + if (p->fd < 0) + return log_unit_error_errno(UNIT(s), p->fd, "Failed to open FIFO %s: %m", p->path); + + socket_apply_fifo_options(s, p->fd); + socket_symlink(s); + break; + + case SOCKET_MQUEUE: + + p->fd = mq_address_create( + p->path, + s->socket_mode, + s->mq_maxmsg, + s->mq_msgsize); + if (p->fd < 0) + return log_unit_error_errno(UNIT(s), p->fd, "Failed to open message queue %s: %m", p->path); + break; + + case SOCKET_USB_FUNCTION: { + _cleanup_free_ char *ep = NULL; + + ep = path_make_absolute("ep0", p->path); + if (!ep) + return -ENOMEM; + + p->fd = usbffs_address_create(ep); + if (p->fd < 0) + return p->fd; + + r = usbffs_write_descs(p->fd, SERVICE(UNIT_DEREF(s->service))); + if (r < 0) + return r; + + r = usbffs_dispatch_eps(p); + if (r < 0) + return r; + + break; + } + default: + assert_not_reached(); + } + } + + s = NULL; + return 0; +} + +static void socket_unwatch_fds(Socket *s) { + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + continue; + + if (!p->event_source) + continue; + + r = sd_event_source_set_enabled(p->event_source, SD_EVENT_OFF); + if (r < 0) + log_unit_debug_errno(UNIT(s), r, "Failed to disable event source: %m"); + } +} + +static int socket_watch_fds(Socket *s) { + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + continue; + + if (p->event_source) { + r = sd_event_source_set_enabled(p->event_source, SD_EVENT_ON); + if (r < 0) + goto fail; + } else { + r = sd_event_add_io(UNIT(s)->manager->event, &p->event_source, p->fd, EPOLLIN, socket_dispatch_io, p); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(p->event_source, "socket-port-io"); + } + + r = sd_event_source_set_ratelimit(p->event_source, s->poll_limit_interval, s->poll_limit_burst); + if (r < 0) + log_unit_debug_errno(UNIT(s), r, "Failed to set poll limit on I/O event source, ignoring: %m"); + } + + return 0; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to watch listening fds: %m"); + socket_unwatch_fds(s); + return r; +} + +enum { + SOCKET_OPEN_NONE, + SOCKET_OPEN_SOME, + SOCKET_OPEN_ALL, +}; + +static int socket_check_open(Socket *s) { + bool have_open = false, have_closed = false; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + have_closed = true; + else + have_open = true; + + if (have_open && have_closed) + return SOCKET_OPEN_SOME; + } + + if (have_open) + return SOCKET_OPEN_ALL; + + return SOCKET_OPEN_NONE; +} + +static void socket_set_state(Socket *s, SocketState state) { + SocketState old_state; + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + old_state = s->state; + s->state = state; + + if (!IN_SET(state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL, + SOCKET_CLEANING)) { + + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + socket_unwatch_control_pid(s); + s->control_command = NULL; + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + } + + if (state != SOCKET_LISTENING) + socket_unwatch_fds(s); + + if (!IN_SET(state, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_LISTENING, + SOCKET_RUNNING, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_CLEANING)) + socket_close_fds(s); + + if (state != old_state) + log_unit_debug(UNIT(s), "Changed %s -> %s", socket_state_to_string(old_state), socket_state_to_string(state)); + + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); +} + +static int socket_coldplug(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(s); + assert(s->state == SOCKET_DEAD); + + if (s->deserialized_state == s->state) + return 0; + + if (pidref_is_set(&s->control_pid) && + pidref_is_unwaited(&s->control_pid) > 0 && + IN_SET(s->deserialized_state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL, + SOCKET_CLEANING)) { + + r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false); + if (r < 0) + return r; + + r = socket_arm_timer(s, /* relative= */ false, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec)); + if (r < 0) + return r; + } + + if (IN_SET(s->deserialized_state, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_LISTENING, + SOCKET_RUNNING)) { + + /* Originally, we used to simply reopen all sockets here that we didn't have file descriptors + * for. However, this is problematic, as we won't traverse through the SOCKET_START_CHOWN state for + * them, and thus the UID/GID wouldn't be right. Hence, instead simply check if we have all fds open, + * and if there's a mismatch, warn loudly. */ + + r = socket_check_open(s); + if (r == SOCKET_OPEN_NONE) + log_unit_warning(UNIT(s), + "Socket unit configuration has changed while unit has been running, " + "no open socket file descriptor left. " + "The socket unit is not functional until restarted."); + else if (r == SOCKET_OPEN_SOME) + log_unit_warning(UNIT(s), + "Socket unit configuration has changed while unit has been running, " + "and some socket file descriptors have not been opened yet. " + "The socket unit is not fully functional until restarted."); + } + + if (s->deserialized_state == SOCKET_LISTENING) { + r = socket_watch_fds(s); + if (r < 0) + return r; + } + + if (!IN_SET(s->deserialized_state, SOCKET_DEAD, SOCKET_FAILED, SOCKET_CLEANING)) + (void) unit_setup_exec_runtime(u); + + socket_set_state(s, s->deserialized_state); + return 0; +} + +static int socket_spawn(Socket *s, ExecCommand *c, PidRef *ret_pid) { + + _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT( + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN); + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + pid_t pid; + int r; + + assert(s); + assert(c); + assert(ret_pid); + + r = unit_prepare_exec(UNIT(s)); + if (r < 0) + return r; + + r = socket_arm_timer(s, /* relative= */ true, s->timeout_usec); + if (r < 0) + return r; + + r = unit_set_exec_params(UNIT(s), &exec_params); + if (r < 0) + return r; + + r = exec_spawn(UNIT(s), + c, + &s->exec_context, + &exec_params, + s->exec_runtime, + &s->cgroup_context, + &pid); + if (r < 0) + return r; + + r = pidref_set_pid(&pidref, pid); + if (r < 0) + return r; + + r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true); + if (r < 0) + return r; + + *ret_pid = TAKE_PIDREF(pidref); + return 0; +} + +static int socket_chown(Socket *s, PidRef *ret_pid) { + _cleanup_(pidref_done) PidRef pid = PIDREF_NULL; + int r; + + assert(s); + + r = socket_arm_timer(s, /* relative= */ true, s->timeout_usec); + if (r < 0) + return r; + + /* We have to resolve the user names out-of-process, hence + * let's fork here. It's messy, but well, what can we do? */ + + r = unit_fork_helper_process(UNIT(s), "(sd-chown)", &pid); + if (r < 0) + return r; + if (r == 0) { + uid_t uid = UID_INVALID; + gid_t gid = GID_INVALID; + + /* Child */ + + if (!isempty(s->user)) { + const char *user = s->user; + + r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to resolve user %s: %m", user); + _exit(EXIT_USER); + } + } + + if (!isempty(s->group)) { + const char *group = s->group; + + r = get_group_creds(&group, &gid, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to resolve group %s: %m", group); + _exit(EXIT_GROUP); + } + } + + LIST_FOREACH(port, p, s->ports) { + const char *path = NULL; + + if (p->type == SOCKET_SOCKET) + path = socket_address_get_path(&p->address); + else if (p->type == SOCKET_FIFO) + path = p->path; + + if (!path) + continue; + + if (chown(path, uid, gid) < 0) { + log_unit_error_errno(UNIT(s), errno, "Failed to chown(): %m"); + _exit(EXIT_CHOWN); + } + } + + _exit(EXIT_SUCCESS); + } + + r = unit_watch_pidref(UNIT(s), &pid, /* exclusive= */ true); + if (r < 0) + return r; + + *ret_pid = TAKE_PIDREF(pid); + return 0; +} + +static void socket_enter_dead(Socket *s, SocketResult f) { + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + if (s->result == SOCKET_SUCCESS) + unit_log_success(UNIT(s)); + else + unit_log_failure(UNIT(s), socket_result_to_string(s->result)); + + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop); + + socket_set_state(s, s->result != SOCKET_SUCCESS ? SOCKET_FAILED : SOCKET_DEAD); + + s->exec_runtime = exec_runtime_destroy(s->exec_runtime); + + unit_destroy_runtime_data(UNIT(s), &s->exec_context); + + unit_unref_uid_gid(UNIT(s), true); +} + +static void socket_enter_signal(Socket *s, SocketState state, SocketResult f); + +static void socket_enter_stop_post(Socket *s, SocketResult f) { + int r; + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_STOP_POST; + s->control_command = s->exec_command[SOCKET_EXEC_STOP_POST]; + + if (s->control_command) { + pidref_done(&s->control_pid); + + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-post' task: %m"); + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES); + return; + } + + socket_set_state(s, SOCKET_STOP_POST); + } else + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_SUCCESS); +} + +static int state_to_kill_operation(Socket *s, SocketState state) { + if (state == SOCKET_STOP_PRE_SIGTERM && unit_has_job_type(UNIT(s), JOB_RESTART)) + return KILL_RESTART; + + if (state == SOCKET_FINAL_SIGTERM) + return KILL_TERMINATE; + + return KILL_KILL; +} + +static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) { + int r; + + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + r = unit_kill_context( + UNIT(s), + &s->kill_context, + state_to_kill_operation(s, state), + /* main_pid= */ NULL, + &s->control_pid, + /* main_pid_alien= */ false); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + goto fail; + } + + if (r > 0) { + r = socket_arm_timer(s, /* relative= */ true, s->timeout_usec); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m"); + goto fail; + } + + socket_set_state(s, state); + } else if (state == SOCKET_STOP_PRE_SIGTERM) + socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_SUCCESS); + else if (state == SOCKET_STOP_PRE_SIGKILL) + socket_enter_stop_post(s, SOCKET_SUCCESS); + else if (state == SOCKET_FINAL_SIGTERM) + socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS); + else + socket_enter_dead(s, SOCKET_SUCCESS); + + return; + +fail: + if (IN_SET(state, SOCKET_STOP_PRE_SIGTERM, SOCKET_STOP_PRE_SIGKILL)) + socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES); + else + socket_enter_dead(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_stop_pre(Socket *s, SocketResult f) { + int r; + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_STOP_PRE; + s->control_command = s->exec_command[SOCKET_EXEC_STOP_PRE]; + + if (s->control_command) { + pidref_done(&s->control_pid); + + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-pre' task: %m"); + socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES); + return; + } + + socket_set_state(s, SOCKET_STOP_PRE); + } else + socket_enter_stop_post(s, SOCKET_SUCCESS); +} + +static void socket_enter_listening(Socket *s) { + int r; + assert(s); + + if (!s->accept && s->flush_pending) { + log_unit_debug(UNIT(s), "Flushing socket before listening."); + flush_ports(s); + } + + r = socket_watch_fds(s); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to watch sockets: %m"); + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); + return; + } + + socket_set_state(s, SOCKET_LISTENING); +} + +static void socket_enter_start_post(Socket *s) { + int r; + assert(s); + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_START_POST; + s->control_command = s->exec_command[SOCKET_EXEC_START_POST]; + + if (s->control_command) { + pidref_done(&s->control_pid); + + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-post' task: %m"); + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); + return; + } + + socket_set_state(s, SOCKET_START_POST); + } else + socket_enter_listening(s); +} + +static void socket_enter_start_chown(Socket *s) { + int r; + + assert(s); + + r = socket_open_fds(s); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to listen on sockets: %m"); + goto fail; + } + + if (!isempty(s->user) || !isempty(s->group)) { + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_START_CHOWN; + s->control_command = NULL; + + r = socket_chown(s, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-chown' task: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_START_CHOWN); + } else + socket_enter_start_post(s); + + return; + +fail: + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_start_pre(Socket *s) { + int r; + assert(s); + + socket_unwatch_control_pid(s); + + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start); + + s->control_command_id = SOCKET_EXEC_START_PRE; + s->control_command = s->exec_command[SOCKET_EXEC_START_PRE]; + + if (s->control_command) { + pidref_done(&s->control_pid); + + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-pre' task: %m"); + socket_enter_dead(s, SOCKET_FAILURE_RESOURCES); + return; + } + + socket_set_state(s, SOCKET_START_PRE); + } else + socket_enter_start_chown(s); +} + +static void flush_ports(Socket *s) { + assert(s); + + /* Flush all incoming traffic, regardless if actual bytes or new connections, so that this socket isn't busy + * anymore */ + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + continue; + + (void) flush_accept(p->fd); + (void) flush_fd(p->fd); + } +} + +static void socket_enter_running(Socket *s, int cfd_in) { + /* Note that this call takes possession of the connection fd passed. It either has to assign it + * somewhere or close it. */ + _cleanup_close_ int cfd = cfd_in; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(s); + + /* We don't take connections anymore if we are supposed to shut down anyway */ + if (unit_stop_pending(UNIT(s))) { + + log_unit_debug(UNIT(s), "Suppressing connection request since unit stop is scheduled."); + + if (cfd >= 0) + goto refuse; + + flush_ports(s); + return; + } + + if (!ratelimit_below(&s->trigger_limit)) { + log_unit_warning(UNIT(s), "Trigger limit hit, refusing further activation."); + socket_enter_stop_pre(s, SOCKET_FAILURE_TRIGGER_LIMIT_HIT); + goto refuse; + } + + if (cfd < 0) { /* Accept=no case */ + bool pending = false; + Unit *other; + + /* If there's already a start pending don't bother to do anything */ + UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_TRIGGERS) + if (unit_active_or_pending(other)) { + pending = true; + break; + } + + if (!pending) { + if (!UNIT_ISSET(s->service)) { + r = log_unit_warning_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT), + "Service to activate vanished, refusing activation."); + goto fail; + } + + r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT_DEREF(s->service), JOB_REPLACE, NULL, &error, NULL); + if (r < 0) + goto queue_error; + } + + socket_set_state(s, SOCKET_RUNNING); + } else { /* Accept=yes case */ + _cleanup_(socket_peer_unrefp) SocketPeer *p = NULL; + Unit *service; + + if (s->n_connections >= s->max_connections) { + log_unit_warning(UNIT(s), "Too many incoming connections (%u), dropping connection.", + s->n_connections); + goto refuse; + } + + if (s->max_connections_per_source > 0) { + r = socket_acquire_peer(s, cfd, &p); + if (r < 0) { + if (ERRNO_IS_DISCONNECT(r)) + return; + /* We didn't have enough resources to acquire peer information, let's fail. */ + goto fail; + } + if (r > 0 && p->n_ref > s->max_connections_per_source) { + _cleanup_free_ char *t = NULL; + + (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, true, false, &t); + + log_unit_warning(UNIT(s), + "Too many incoming connections (%u) from source %s, dropping connection.", + p->n_ref, strnull(t)); + goto refuse; + } + } + + r = socket_load_service_unit(s, cfd, &service); + if (r < 0) { + if (ERRNO_IS_DISCONNECT(r)) + return; + + log_unit_warning_errno(UNIT(s), r, "Failed to load connection service unit: %m"); + goto fail; + } + + r = unit_add_two_dependencies(UNIT(s), UNIT_BEFORE, UNIT_TRIGGERS, service, + false, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to add Before=/Triggers= dependencies on connection unit: %m"); + goto fail; + } + + s->n_accepted++; + + r = service_set_socket_fd(SERVICE(service), cfd, s, p, s->selinux_context_from_net); + if (r < 0) { + if (ERRNO_IS_DISCONNECT(r)) + return; + + log_unit_warning_errno(UNIT(s), r, "Failed to set socket on service: %m"); + goto fail; + } + + TAKE_FD(cfd); /* We passed ownership of the fd to the service now. Forget it here. */ + s->n_connections++; + + r = manager_add_job(UNIT(s)->manager, JOB_START, service, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) { + /* We failed to activate the new service, but it still exists. Let's make sure the + * service closes and forgets the connection fd again, immediately. */ + service_release_socket_fd(SERVICE(service)); + goto queue_error; + } + + /* Notify clients about changed counters */ + unit_add_to_dbus_queue(UNIT(s)); + } + + return; + +refuse: + s->n_refused++; + return; + +queue_error: + if (ERRNO_IS_RESOURCE(r)) + log_unit_warning(UNIT(s), "Failed to queue service startup job: %s", + bus_error_message(&error, r)); + else + log_unit_warning(UNIT(s), "Failed to queue service startup job (Maybe the service file is missing or not a %s unit?): %s", + cfd >= 0 ? "template" : "non-template", + bus_error_message(&error, r)); + +fail: + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_run_next(Socket *s) { + int r; + + assert(s); + assert(s->control_command); + assert(s->control_command->command_next); + + socket_unwatch_control_pid(s); + + s->control_command = s->control_command->command_next; + + pidref_done(&s->control_pid); + + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn next task: %m"); + + if (s->state == SOCKET_START_POST) + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); + else if (s->state == SOCKET_STOP_POST) + socket_enter_dead(s, SOCKET_FAILURE_RESOURCES); + else + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES); + } +} + +static int socket_start(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(s); + + /* We cannot fulfill this request right now, try again later + * please! */ + if (IN_SET(s->state, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL, + SOCKET_CLEANING)) + return -EAGAIN; + + /* Already on it! */ + if (IN_SET(s->state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST)) + return 0; + + /* Cannot run this without the service being around */ + if (UNIT_ISSET(s->service)) { + Service *service; + + service = SERVICE(UNIT_DEREF(s->service)); + + if (UNIT(service)->load_state != UNIT_LOADED) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT), + "Socket service %s not loaded, refusing.", UNIT(service)->id); + + /* If the service is already active we cannot start the + * socket */ + if (!IN_SET(service->state, + SERVICE_DEAD, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED, SERVICE_FAILED_BEFORE_AUTO_RESTART, + SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED)) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY), + "Socket service %s already active, refusing.", UNIT(service)->id); + } + + assert(IN_SET(s->state, SOCKET_DEAD, SOCKET_FAILED)); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + s->result = SOCKET_SUCCESS; + exec_command_reset_status_list_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX); + + u->reset_accounting = true; + + socket_enter_start_pre(s); + return 1; +} + +static int socket_stop(Unit *u) { + Socket *s = SOCKET(u); + + assert(s); + + /* Already on it */ + if (IN_SET(s->state, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL)) + return 0; + + /* If there's already something running we go directly into + * kill mode. */ + if (IN_SET(s->state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST)) { + socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_SUCCESS); + return -EAGAIN; + } + + /* If we are currently cleaning, then abort it, brutally. */ + if (s->state == SOCKET_CLEANING) { + socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS); + return 0; + } + + assert(IN_SET(s->state, SOCKET_LISTENING, SOCKET_RUNNING)); + + socket_enter_stop_pre(s, SOCKET_SUCCESS); + return 1; +} + +static int socket_serialize(Unit *u, FILE *f, FDSet *fds) { + Socket *s = SOCKET(u); + int r; + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", socket_state_to_string(s->state)); + (void) serialize_item(f, "result", socket_result_to_string(s->result)); + (void) serialize_item_format(f, "n-accepted", "%u", s->n_accepted); + (void) serialize_item_format(f, "n-refused", "%u", s->n_refused); + (void) serialize_pidref(f, fds, "control-pid", &s->control_pid); + + if (s->control_command_id >= 0) + (void) serialize_item(f, "control-command", socket_exec_command_to_string(s->control_command_id)); + + LIST_FOREACH(port, p, s->ports) { + int copy; + + if (p->fd < 0) + continue; + + copy = fdset_put_dup(fds, p->fd); + if (copy < 0) + return log_unit_warning_errno(u, copy, "Failed to serialize socket fd: %m"); + + if (p->type == SOCKET_SOCKET) { + _cleanup_free_ char *t = NULL; + + r = socket_address_print(&p->address, &t); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to format socket address: %m"); + + if (socket_address_family(&p->address) == AF_NETLINK) + (void) serialize_item_format(f, "netlink", "%i %s", copy, t); + else + (void) serialize_item_format(f, "socket", "%i %i %s", copy, p->address.type, t); + } else if (p->type == SOCKET_SPECIAL) + (void) serialize_item_format(f, "special", "%i %s", copy, p->path); + else if (p->type == SOCKET_MQUEUE) + (void) serialize_item_format(f, "mqueue", "%i %s", copy, p->path); + else if (p->type == SOCKET_USB_FUNCTION) + (void) serialize_item_format(f, "ffs", "%i %s", copy, p->path); + else { + assert(p->type == SOCKET_FIFO); + (void) serialize_item_format(f, "fifo", "%i %s", copy, p->path); + } + } + + (void) serialize_ratelimit(f, "trigger-ratelimit", &s->trigger_limit); + + return 0; +} + +static int socket_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Socket *s = SOCKET(u); + int r; + + assert(u); + assert(key); + assert(value); + + if (streq(key, "state")) { + SocketState state; + + state = socket_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + } else if (streq(key, "result")) { + SocketResult f; + + f = socket_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != SOCKET_SUCCESS) + s->result = f; + + } else if (streq(key, "n-accepted")) { + unsigned k; + + if (safe_atou(value, &k) < 0) + log_unit_debug(u, "Failed to parse n-accepted value: %s", value); + else + s->n_accepted += k; + } else if (streq(key, "n-refused")) { + unsigned k; + + if (safe_atou(value, &k) < 0) + log_unit_debug(u, "Failed to parse n-refused value: %s", value); + else + s->n_refused += k; + } else if (streq(key, "control-pid")) { + pidref_done(&s->control_pid); + (void) deserialize_pidref(fds, value, &s->control_pid); + + } else if (streq(key, "control-command")) { + SocketExecCommand id; + + id = socket_exec_command_from_string(value); + if (id < 0) + log_unit_debug(u, "Failed to parse exec-command value: %s", value); + else { + s->control_command_id = id; + s->control_command = s->exec_command[id]; + } + } else if (streq(key, "fifo")) { + _cleanup_free_ char *fdv = NULL; + bool found = false; + int fd; + + r = extract_first_word(&value, &fdv, NULL, 0); + if (r <= 0) { + log_unit_debug(u, "Failed to parse fifo value: %s", value); + return 0; + } + + fd = parse_fd(fdv); + if (fd < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Invalid fifo value: %s", fdv); + return 0; + } + + LIST_FOREACH(port, p, s->ports) + if (p->fd < 0 && + p->type == SOCKET_FIFO && + path_equal_or_inode_same(p->path, value, 0)) { + p->fd = fdset_remove(fds, fd); + found = true; + break; + } + if (!found) + log_unit_debug(u, "No matching fifo socket found: %s", value); + + } else if (streq(key, "special")) { + _cleanup_free_ char *fdv = NULL; + bool found = false; + int fd; + + r = extract_first_word(&value, &fdv, NULL, 0); + if (r <= 0) { + log_unit_debug(u, "Failed to parse special value: %s", value); + return 0; + } + + fd = parse_fd(fdv); + if (fd < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Invalid special value: %s", fdv); + return 0; + } + + LIST_FOREACH(port, p, s->ports) + if (p->fd < 0 && + p->type == SOCKET_SPECIAL && + path_equal_or_inode_same(p->path, value, 0)) { + p->fd = fdset_remove(fds, fd); + found = true; + break; + } + if (!found) + log_unit_debug(u, "No matching special socket found: %s", value); + + } else if (streq(key, "mqueue")) { + _cleanup_free_ char *fdv = NULL; + bool found = false; + int fd; + + r = extract_first_word(&value, &fdv, NULL, 0); + if (r <= 0) { + log_unit_debug(u, "Failed to parse mqueue value: %s", value); + return 0; + } + + fd = parse_fd(fdv); + if (fd < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Invalid mqueue value: %s", fdv); + return 0; + } + + LIST_FOREACH(port, p, s->ports) + if (p->fd < 0 && + p->type == SOCKET_MQUEUE && + streq(p->path, value)) { + p->fd = fdset_remove(fds, fd); + found = true; + break; + } + if (!found) + log_unit_debug(u, "No matching mqueue socket found: %s", value); + + } else if (streq(key, "socket")) { + _cleanup_free_ char *fdv = NULL, *typev = NULL; + bool found = false; + int fd, type; + + r = extract_first_word(&value, &fdv, NULL, 0); + if (r <= 0) { + log_unit_debug(u, "Failed to parse socket fd from value: %s", value); + return 0; + } + + fd = parse_fd(fdv); + if (fd < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Invalid socket fd: %s", fdv); + return 0; + } + + r = extract_first_word(&value, &typev, NULL, 0); + if (r <= 0) { + log_unit_debug(u, "Failed to parse socket type from value: %s", value); + return 0; + } + + if (safe_atoi(typev, &type) < 0 || type < 0) { + log_unit_debug(u, "Invalid socket type: %s", typev); + return 0; + } + + LIST_FOREACH(port, p, s->ports) + if (p->fd < 0 && + socket_address_is(&p->address, value, type)) { + p->fd = fdset_remove(fds, fd); + found = true; + break; + } + if (!found) + log_unit_debug(u, "No matching %s socket found: %s", + socket_address_type_to_string(type), value); + + } else if (streq(key, "netlink")) { + _cleanup_free_ char *fdv = NULL; + bool found = false; + int fd; + + r = extract_first_word(&value, &fdv, NULL, 0); + if (r <= 0) { + log_unit_debug(u, "Failed to parse socket value: %s", value); + return 0; + } + + fd = parse_fd(fdv); + if (fd < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Invalid socket value: %s", fdv); + return 0; + } + + LIST_FOREACH(port, p, s->ports) + if (p->fd < 0 && + socket_address_is_netlink(&p->address, value)) { + p->fd = fdset_remove(fds, fd); + found = true; + break; + } + if (!found) + log_unit_debug(u, "No matching netlink socket found: %s", value); + + } else if (streq(key, "ffs")) { + _cleanup_free_ char *fdv = NULL; + bool found = false; + int fd; + + r = extract_first_word(&value, &fdv, NULL, 0); + if (r <= 0) { + log_unit_debug(u, "Failed to parse ffs value: %s", value); + return 0; + } + + fd = parse_fd(fdv); + if (fd < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Invalid ffs value: %s", fdv); + return 0; + } + + LIST_FOREACH(port, p, s->ports) + if (p->fd < 0 && + p->type == SOCKET_USB_FUNCTION && + path_equal_or_inode_same(p->path, value, 0)) { + p->fd = fdset_remove(fds, fd); + found = true; + break; + } + if (!found) + log_unit_debug(u, "No matching ffs socket found: %s", value); + + } else if (streq(key, "trigger-ratelimit")) + deserialize_ratelimit(&s->trigger_limit, key, value); + + else + log_unit_debug(UNIT(s), "Unknown serialization key: %s", key); + + return 0; +} + +static void socket_distribute_fds(Unit *u, FDSet *fds) { + Socket *s = SOCKET(u); + + assert(u); + + LIST_FOREACH(port, p, s->ports) { + int fd; + + if (p->type != SOCKET_SOCKET) + continue; + + if (p->fd >= 0) + continue; + + FDSET_FOREACH(fd, fds) { + if (socket_address_matches_fd(&p->address, fd)) { + p->fd = fdset_remove(fds, fd); + s->deserialized_state = SOCKET_LISTENING; + break; + } + } + } +} + +static UnitActiveState socket_active_state(Unit *u) { + assert(u); + + return state_translation_table[SOCKET(u)->state]; +} + +static const char *socket_sub_state_to_string(Unit *u) { + assert(u); + + return socket_state_to_string(SOCKET(u)->state); +} + +int socket_port_to_address(const SocketPort *p, char **ret) { + _cleanup_free_ char *address = NULL; + int r; + + assert(p); + assert(ret); + + switch (p->type) { + case SOCKET_SOCKET: { + r = socket_address_print(&p->address, &address); + if (r < 0) + return r; + + break; + } + + case SOCKET_SPECIAL: + case SOCKET_MQUEUE: + case SOCKET_FIFO: + case SOCKET_USB_FUNCTION: + address = strdup(p->path); + if (!address) + return -ENOMEM; + break; + + default: + assert_not_reached(); + } + + *ret = TAKE_PTR(address); + + return 0; +} + +const char* socket_port_type_to_string(SocketPort *p) { + + assert(p); + + switch (p->type) { + + case SOCKET_SOCKET: + + switch (p->address.type) { + + case SOCK_STREAM: + return "Stream"; + + case SOCK_DGRAM: + return "Datagram"; + + case SOCK_SEQPACKET: + return "SequentialPacket"; + + case SOCK_RAW: + if (socket_address_family(&p->address) == AF_NETLINK) + return "Netlink"; + + _fallthrough_; + default: + return NULL; + } + + case SOCKET_SPECIAL: + return "Special"; + + case SOCKET_MQUEUE: + return "MessageQueue"; + + case SOCKET_FIFO: + return "FIFO"; + + case SOCKET_USB_FUNCTION: + return "USBFunction"; + + default: + return NULL; + } +} + +SocketType socket_port_type_from_string(const char *s) { + assert(s); + + if (STR_IN_SET(s, "Stream", "Datagram", "SequentialPacket", "Netlink")) + return SOCKET_SOCKET; + else if (streq(s, "Special")) + return SOCKET_SPECIAL; + else if (streq(s, "MessageQueue")) + return SOCKET_MQUEUE; + else if (streq(s, "FIFO")) + return SOCKET_FIFO; + else if (streq(s, "USBFunction")) + return SOCKET_USB_FUNCTION; + else + return _SOCKET_TYPE_INVALID; +} + +static bool socket_may_gc(Unit *u) { + Socket *s = SOCKET(u); + + assert(u); + + return s->n_connections == 0; +} + +static int socket_accept_do(Socket *s, int fd) { + int cfd; + + assert(s); + assert(fd >= 0); + + cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (cfd < 0) + /* Convert transient network errors into clean and well-defined EAGAIN */ + return ERRNO_IS_ACCEPT_AGAIN(errno) ? -EAGAIN : -errno; + + return cfd; +} + +static int socket_accept_in_cgroup(Socket *s, SocketPort *p, int fd) { + _cleanup_(pidref_done) PidRef pid = PIDREF_NULL; + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + int cfd, r; + + assert(s); + assert(p); + assert(fd >= 0); + + /* Similar to socket_address_listen_in_cgroup(), but for accept() rather than socket(): make sure that any + * connection socket is also properly associated with the cgroup. */ + + if (!IN_SET(p->address.sockaddr.sa.sa_family, AF_INET, AF_INET6)) + goto shortcut; + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == BPF_FIREWALL_UNSUPPORTED) + goto shortcut; + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); + + r = unit_fork_helper_process(UNIT(s), "(sd-accept)", &pid); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to fork off accept stub process: %m"); + if (r == 0) { + /* Child */ + + pair[0] = safe_close(pair[0]); + + cfd = socket_accept_do(s, fd); + if (cfd == -EAGAIN) /* spurious accept() */ + _exit(EXIT_SUCCESS); + if (cfd < 0) { + log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m"); + _exit(EXIT_FAILURE); + } + + r = send_one_fd(pair[1], cfd, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to send connection socket to parent: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + cfd = receive_one_fd(pair[0], 0); + + /* We synchronously wait for the helper, as it shouldn't be slow */ + r = wait_for_terminate_and_check("(sd-accept)", pid.pid, WAIT_LOG_ABNORMAL); + if (r < 0) { + safe_close(cfd); + return r; + } + + /* If we received no fd, we got EIO here. If this happens with a process exit code of EXIT_SUCCESS + * this is a spurious accept(), let's convert that back to EAGAIN here. */ + if (cfd == -EIO) + return -EAGAIN; + if (cfd < 0) + return log_unit_error_errno(UNIT(s), cfd, "Failed to receive connection socket: %m"); + + return cfd; + +shortcut: + cfd = socket_accept_do(s, fd); + if (cfd == -EAGAIN) /* spurious accept(), skip it silently */ + return -EAGAIN; + if (cfd < 0) + return log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m"); + + return cfd; +} + +static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + SocketPort *p = ASSERT_PTR(userdata); + int cfd = -EBADF; + + assert(fd >= 0); + + if (p->socket->state != SOCKET_LISTENING) + return 0; + + log_unit_debug(UNIT(p->socket), "Incoming traffic"); + + if (revents != EPOLLIN) { + if (revents & EPOLLHUP) + log_unit_error(UNIT(p->socket), "Got POLLHUP on a listening socket. The service probably invoked shutdown() on it, and should better not do that."); + else + log_unit_error(UNIT(p->socket), "Got unexpected poll event (0x%x) on socket.", revents); + goto fail; + } + + if (p->socket->accept && + p->type == SOCKET_SOCKET && + socket_address_can_accept(&p->address)) { + + cfd = socket_accept_in_cgroup(p->socket, p, fd); + if (cfd == -EAGAIN) /* Spurious accept() */ + return 0; + if (cfd < 0) + goto fail; + + socket_apply_socket_options(p->socket, p, cfd); + } + + socket_enter_running(p->socket, cfd); + return 0; + +fail: + socket_enter_stop_pre(p->socket, SOCKET_FAILURE_RESOURCES); + return 0; +} + +static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Socket *s = SOCKET(u); + SocketResult f; + + assert(s); + assert(pid >= 0); + + if (pid != s->control_pid.pid) + return; + + pidref_done(&s->control_pid); + + if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + f = SOCKET_SUCCESS; + else if (code == CLD_EXITED) + f = SOCKET_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = SOCKET_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = SOCKET_FAILURE_CORE_DUMP; + else + assert_not_reached(); + + if (s->control_command) { + exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status); + + if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SOCKET_SUCCESS; + } + + unit_log_process_exit( + u, + "Control process", + socket_exec_command_to_string(s->control_command_id), + f == SOCKET_SUCCESS, + code, status); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + if (s->control_command && + s->control_command->command_next && + f == SOCKET_SUCCESS) { + + log_unit_debug(u, "Running next command for state %s", socket_state_to_string(s->state)); + socket_run_next(s); + } else { + s->control_command = NULL; + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + + /* No further commands for this step, so let's figure + * out what to do next */ + + log_unit_debug(u, "Got final SIGCHLD for state %s", socket_state_to_string(s->state)); + + switch (s->state) { + + case SOCKET_START_PRE: + if (f == SOCKET_SUCCESS) + socket_enter_start_chown(s); + else + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, f); + break; + + case SOCKET_START_CHOWN: + if (f == SOCKET_SUCCESS) + socket_enter_start_post(s); + else + socket_enter_stop_pre(s, f); + break; + + case SOCKET_START_POST: + if (f == SOCKET_SUCCESS) + socket_enter_listening(s); + else + socket_enter_stop_pre(s, f); + break; + + case SOCKET_STOP_PRE: + case SOCKET_STOP_PRE_SIGTERM: + case SOCKET_STOP_PRE_SIGKILL: + socket_enter_stop_post(s, f); + break; + + case SOCKET_STOP_POST: + case SOCKET_FINAL_SIGTERM: + case SOCKET_FINAL_SIGKILL: + socket_enter_dead(s, f); + break; + + case SOCKET_CLEANING: + + if (s->clean_result == SOCKET_SUCCESS) + s->clean_result = f; + + socket_enter_dead(s, SOCKET_SUCCESS); + break; + + default: + assert_not_reached(); + } + } + + /* Notify clients about changed exit status */ + unit_add_to_dbus_queue(u); +} + +static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Socket *s = SOCKET(userdata); + + assert(s); + assert(s->timer_event_source == source); + + switch (s->state) { + + case SOCKET_START_PRE: + log_unit_warning(UNIT(s), "Starting timed out. Terminating."); + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_START_CHOWN: + case SOCKET_START_POST: + log_unit_warning(UNIT(s), "Starting timed out. Stopping."); + socket_enter_stop_pre(s, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_STOP_PRE: + log_unit_warning(UNIT(s), "Stopping timed out. Terminating."); + socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_STOP_PRE_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out. Killing."); + socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL. Ignoring."); + socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT); + } + break; + + case SOCKET_STOP_PRE_SIGKILL: + log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring."); + socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_STOP_POST: + log_unit_warning(UNIT(s), "Stopping timed out (2). Terminating."); + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_FINAL_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out (2). Killing."); + socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out (2). Skipping SIGKILL. Ignoring."); + socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT); + } + break; + + case SOCKET_FINAL_SIGKILL: + log_unit_warning(UNIT(s), "Still around after SIGKILL (2). Entering failed mode."); + socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_CLEANING: + log_unit_warning(UNIT(s), "Cleaning timed out. killing."); + + if (s->clean_result == SOCKET_SUCCESS) + s->clean_result = SOCKET_FAILURE_TIMEOUT; + + socket_enter_signal(s, SOCKET_FINAL_SIGKILL, 0); + break; + + default: + assert_not_reached(); + } + + return 0; +} + +int socket_collect_fds(Socket *s, int **fds) { + size_t k = 0, n = 0; + int *rfds; + + assert(s); + assert(fds); + + /* Called from the service code for requesting our fds */ + + LIST_FOREACH(port, p, s->ports) { + if (p->fd >= 0) + n++; + n += p->n_auxiliary_fds; + } + + if (n <= 0) { + *fds = NULL; + return 0; + } + + rfds = new(int, n); + if (!rfds) + return -ENOMEM; + + LIST_FOREACH(port, p, s->ports) { + if (p->fd >= 0) + rfds[k++] = p->fd; + for (size_t i = 0; i < p->n_auxiliary_fds; ++i) + rfds[k++] = p->auxiliary_fds[i]; + } + + assert(k == n); + + *fds = rfds; + return (int) n; +} + +static void socket_reset_failed(Unit *u) { + Socket *s = SOCKET(u); + + assert(s); + + if (s->state == SOCKET_FAILED) + socket_set_state(s, SOCKET_DEAD); + + s->result = SOCKET_SUCCESS; + s->clean_result = SOCKET_SUCCESS; +} + +void socket_connection_unref(Socket *s) { + assert(s); + + /* The service is dead. Yay! + * + * This is strictly for one-instance-per-connection + * services. */ + + assert(s->n_connections > 0); + s->n_connections--; + + log_unit_debug(UNIT(s), "One connection closed, %u left.", s->n_connections); +} + +static void socket_trigger_notify(Unit *u, Unit *other) { + Socket *s = SOCKET(u); + + assert(u); + assert(other); + + /* Filter out invocations with bogus state */ + assert(UNIT_IS_LOAD_COMPLETE(other->load_state)); + assert(other->type == UNIT_SERVICE); + + /* Don't propagate state changes from the service if we are already down */ + if (!IN_SET(s->state, SOCKET_RUNNING, SOCKET_LISTENING)) + return; + + /* We don't care for the service state if we are in Accept=yes mode */ + if (s->accept) + return; + + /* Propagate start limit hit state */ + if (other->start_limit_hit) { + socket_enter_stop_pre(s, SOCKET_FAILURE_SERVICE_START_LIMIT_HIT); + return; + } + + /* Don't propagate anything if there's still a job queued */ + if (other->job) + return; + + if (IN_SET(SERVICE(other)->state, + SERVICE_DEAD, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED, SERVICE_FAILED_BEFORE_AUTO_RESTART, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, + SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED)) + socket_enter_listening(s); + + if (SERVICE(other)->state == SERVICE_RUNNING) + socket_set_state(s, SOCKET_RUNNING); +} + +static int socket_get_timeout(Unit *u, usec_t *timeout) { + Socket *s = SOCKET(u); + usec_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +char *socket_fdname(Socket *s) { + assert(s); + + /* Returns the name to use for $LISTEN_NAMES. If the user + * didn't specify anything specifically, use the socket unit's + * name as fallback. */ + + return s->fdname ?: UNIT(s)->id; +} + +static PidRef *socket_control_pid(Unit *u) { + return &ASSERT_PTR(SOCKET(u))->control_pid; +} + +static int socket_clean(Unit *u, ExecCleanMask mask) { + _cleanup_strv_free_ char **l = NULL; + Socket *s = SOCKET(u); + int r; + + assert(s); + assert(mask != 0); + + if (s->state != SOCKET_DEAD) + return -EBUSY; + + r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l); + if (r < 0) + return r; + + if (strv_isempty(l)) + return -EUNATCH; + + socket_unwatch_control_pid(s); + s->clean_result = SOCKET_SUCCESS; + s->control_command = NULL; + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + + r = socket_arm_timer(s, /* relative= */ true, s->exec_context.timeout_clean_usec); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to install timer: %m"); + goto fail; + } + + r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_CLEANING); + return 0; + +fail: + s->clean_result = SOCKET_FAILURE_RESOURCES; + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + return r; +} + +static int socket_can_clean(Unit *u, ExecCleanMask *ret) { + Socket *s = SOCKET(u); + + assert(s); + + return exec_context_get_clean_mask(&s->exec_context, ret); +} + +static int socket_can_start(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(s); + + r = unit_test_start_limit(u); + if (r < 0) { + socket_enter_dead(s, SOCKET_FAILURE_START_LIMIT_HIT); + return r; + } + + return 1; +} + +static const char* const socket_exec_command_table[_SOCKET_EXEC_COMMAND_MAX] = { + [SOCKET_EXEC_START_PRE] = "ExecStartPre", + [SOCKET_EXEC_START_CHOWN] = "ExecStartChown", + [SOCKET_EXEC_START_POST] = "ExecStartPost", + [SOCKET_EXEC_STOP_PRE] = "ExecStopPre", + [SOCKET_EXEC_STOP_POST] = "ExecStopPost" +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_exec_command, SocketExecCommand); + +static const char* const socket_result_table[_SOCKET_RESULT_MAX] = { + [SOCKET_SUCCESS] = "success", + [SOCKET_FAILURE_RESOURCES] = "resources", + [SOCKET_FAILURE_TIMEOUT] = "timeout", + [SOCKET_FAILURE_EXIT_CODE] = "exit-code", + [SOCKET_FAILURE_SIGNAL] = "signal", + [SOCKET_FAILURE_CORE_DUMP] = "core-dump", + [SOCKET_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [SOCKET_FAILURE_TRIGGER_LIMIT_HIT] = "trigger-limit-hit", + [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit" +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_result, SocketResult); + +static const char* const socket_timestamping_table[_SOCKET_TIMESTAMPING_MAX] = { + [SOCKET_TIMESTAMPING_OFF] = "off", + [SOCKET_TIMESTAMPING_US] = "us", + [SOCKET_TIMESTAMPING_NS] = "ns", +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_timestamping, SocketTimestamping); + +SocketTimestamping socket_timestamping_from_string_harder(const char *p) { + SocketTimestamping t; + int r; + + if (!p) + return _SOCKET_TIMESTAMPING_INVALID; + + t = socket_timestamping_from_string(p); + if (t >= 0) + return t; + + /* Let's alternatively support the various other aliases parse_time() accepts for ns and µs here, + * too. */ + if (streq(p, "nsec")) + return SOCKET_TIMESTAMPING_NS; + if (STR_IN_SET(p, "usec", "µs", "μs")) /* Accept both small greek letter mu + micro sign unicode codepoints */ + return SOCKET_TIMESTAMPING_US; + + r = parse_boolean(p); + if (r < 0) + return _SOCKET_TIMESTAMPING_INVALID; + + return r ? SOCKET_TIMESTAMPING_NS : SOCKET_TIMESTAMPING_OFF; /* If boolean yes, default to ns accuracy */ +} + +const UnitVTable socket_vtable = { + .object_size = sizeof(Socket), + .exec_context_offset = offsetof(Socket, exec_context), + .cgroup_context_offset = offsetof(Socket, cgroup_context), + .kill_context_offset = offsetof(Socket, kill_context), + .exec_runtime_offset = offsetof(Socket, exec_runtime), + + .sections = + "Unit\0" + "Socket\0" + "Install\0", + .private_section = "Socket", + + .can_transient = true, + .can_trigger = true, + .can_fail = true, + + .init = socket_init, + .done = socket_done, + .load = socket_load, + + .coldplug = socket_coldplug, + + .dump = socket_dump, + + .start = socket_start, + .stop = socket_stop, + + .clean = socket_clean, + .can_clean = socket_can_clean, + + .get_timeout = socket_get_timeout, + + .serialize = socket_serialize, + .deserialize_item = socket_deserialize_item, + .distribute_fds = socket_distribute_fds, + + .active_state = socket_active_state, + .sub_state_to_string = socket_sub_state_to_string, + + .will_restart = unit_will_restart_default, + + .may_gc = socket_may_gc, + + .sigchld_event = socket_sigchld_event, + + .trigger_notify = socket_trigger_notify, + + .reset_failed = socket_reset_failed, + + .control_pid = socket_control_pid, + + .bus_set_property = bus_socket_set_property, + .bus_commit_properties = bus_socket_commit_properties, + + .status_message_formats = { + .finished_start_job = { + [JOB_DONE] = "Listening on %s.", + [JOB_FAILED] = "Failed to listen on %s.", + [JOB_TIMEOUT] = "Timed out starting %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Closed %s.", + [JOB_FAILED] = "Failed stopping %s.", + [JOB_TIMEOUT] = "Timed out stopping %s.", + }, + }, + + .can_start = socket_can_start, +}; diff --git a/src/core/socket.h b/src/core/socket.h new file mode 100644 index 0000000..0983e8c --- /dev/null +++ b/src/core/socket.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Socket Socket; +typedef struct SocketPeer SocketPeer; + +#include "mount.h" +#include "pidref.h" +#include "socket-util.h" +#include "unit.h" + +typedef enum SocketExecCommand { + SOCKET_EXEC_START_PRE, + SOCKET_EXEC_START_CHOWN, + SOCKET_EXEC_START_POST, + SOCKET_EXEC_STOP_PRE, + SOCKET_EXEC_STOP_POST, + _SOCKET_EXEC_COMMAND_MAX, + _SOCKET_EXEC_COMMAND_INVALID = -EINVAL, +} SocketExecCommand; + +typedef enum SocketType { + SOCKET_SOCKET, + SOCKET_FIFO, + SOCKET_SPECIAL, + SOCKET_MQUEUE, + SOCKET_USB_FUNCTION, + _SOCKET_TYPE_MAX, + _SOCKET_TYPE_INVALID = -EINVAL, +} SocketType; + +typedef enum SocketResult { + SOCKET_SUCCESS, + SOCKET_FAILURE_RESOURCES, + SOCKET_FAILURE_TIMEOUT, + SOCKET_FAILURE_EXIT_CODE, + SOCKET_FAILURE_SIGNAL, + SOCKET_FAILURE_CORE_DUMP, + SOCKET_FAILURE_START_LIMIT_HIT, + SOCKET_FAILURE_TRIGGER_LIMIT_HIT, + SOCKET_FAILURE_SERVICE_START_LIMIT_HIT, + _SOCKET_RESULT_MAX, + _SOCKET_RESULT_INVALID = -EINVAL, +} SocketResult; + +typedef struct SocketPort { + Socket *socket; + + SocketType type; + int fd; + int *auxiliary_fds; + size_t n_auxiliary_fds; + + SocketAddress address; + char *path; + sd_event_source *event_source; + + LIST_FIELDS(struct SocketPort, port); +} SocketPort; + +typedef enum SocketTimestamping { + SOCKET_TIMESTAMPING_OFF, + SOCKET_TIMESTAMPING_US, /* SO_TIMESTAMP */ + SOCKET_TIMESTAMPING_NS, /* SO_TIMESTAMPNS */ + _SOCKET_TIMESTAMPING_MAX, + _SOCKET_TIMESTAMPING_INVALID = -EINVAL, +} SocketTimestamping; + +struct Socket { + Unit meta; + + LIST_HEAD(SocketPort, ports); + + Set *peers_by_address; + + unsigned n_accepted; + unsigned n_connections; + unsigned n_refused; + unsigned max_connections; + unsigned max_connections_per_source; + + unsigned backlog; + unsigned keep_alive_cnt; + usec_t timeout_usec; + usec_t keep_alive_time; + usec_t keep_alive_interval; + usec_t defer_accept; + + ExecCommand* exec_command[_SOCKET_EXEC_COMMAND_MAX]; + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ExecRuntime *exec_runtime; + + /* For Accept=no sockets refers to the one service we'll + * activate. For Accept=yes sockets is either NULL, or filled + * to refer to the next service we spawn. */ + UnitRef service; + + SocketState state, deserialized_state; + + sd_event_source *timer_event_source; + + ExecCommand* control_command; + SocketExecCommand control_command_id; + PidRef control_pid; + + mode_t directory_mode; + mode_t socket_mode; + + SocketResult result; + SocketResult clean_result; + + char **symlinks; + + bool accept; + bool remove_on_stop; + bool writable; + bool flush_pending; + + int socket_protocol; + + /* Socket options */ + bool keep_alive; + bool no_delay; + bool free_bind; + bool transparent; + bool broadcast; + bool pass_cred; + bool pass_sec; + bool pass_pktinfo; + SocketTimestamping timestamping; + + /* Only for INET6 sockets: issue IPV6_V6ONLY sockopt */ + SocketAddressBindIPv6Only bind_ipv6_only; + + int priority; + int mark; + size_t receive_buffer; + size_t send_buffer; + int ip_tos; + int ip_ttl; + size_t pipe_size; + char *bind_to_device; + char *tcp_congestion; + bool reuse_port; + long mq_maxmsg; + long mq_msgsize; + + char *smack; + char *smack_ip_in; + char *smack_ip_out; + + bool selinux_context_from_net; + + char *user, *group; + + char *fdname; + + RateLimit trigger_limit; + usec_t poll_limit_interval; + unsigned poll_limit_burst; +}; + +SocketPeer *socket_peer_ref(SocketPeer *p); +SocketPeer *socket_peer_unref(SocketPeer *p); +int socket_acquire_peer(Socket *s, int fd, SocketPeer **p); + +DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPeer*, socket_peer_unref); + +/* Called from the service code when collecting fds */ +int socket_collect_fds(Socket *s, int **fds); + +/* Called from the service code when a per-connection service ended */ +void socket_connection_unref(Socket *s); + +SocketPort *socket_port_free(SocketPort *p); +DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPort*, socket_port_free); + +void socket_free_ports(Socket *s); + +int socket_port_to_address(const SocketPort *s, char **ret); + +int socket_load_service_unit(Socket *s, int cfd, Unit **ret); + +char *socket_fdname(Socket *s); + +extern const UnitVTable socket_vtable; + +const char* socket_exec_command_to_string(SocketExecCommand i) _const_; +SocketExecCommand socket_exec_command_from_string(const char *s) _pure_; + +const char* socket_result_to_string(SocketResult i) _const_; +SocketResult socket_result_from_string(const char *s) _pure_; + +const char* socket_port_type_to_string(SocketPort *p) _pure_; +SocketType socket_port_type_from_string(const char *p) _pure_; + +const char* socket_timestamping_to_string(SocketTimestamping p) _const_; +SocketTimestamping socket_timestamping_from_string(const char *p) _pure_; +SocketTimestamping socket_timestamping_from_string_harder(const char *p) _pure_; + +DEFINE_CAST(SOCKET, Socket); diff --git a/src/core/swap.c b/src/core/swap.c new file mode 100644 index 0000000..488b171 --- /dev/null +++ b/src/core/swap.c @@ -0,0 +1,1680 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "dbus-swap.h" +#include "dbus-unit.h" +#include "device-util.h" +#include "device.h" +#include "escape.h" +#include "exit-status.h" +#include "fd-util.h" +#include "format-util.h" +#include "fstab-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "swap.h" +#include "unit-name.h" +#include "unit.h" +#include "virt.h" + +static const UnitActiveState state_translation_table[_SWAP_STATE_MAX] = { + [SWAP_DEAD] = UNIT_INACTIVE, + [SWAP_ACTIVATING] = UNIT_ACTIVATING, + [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE, + [SWAP_ACTIVE] = UNIT_ACTIVE, + [SWAP_DEACTIVATING] = UNIT_DEACTIVATING, + [SWAP_DEACTIVATING_SIGTERM] = UNIT_DEACTIVATING, + [SWAP_DEACTIVATING_SIGKILL] = UNIT_DEACTIVATING, + [SWAP_FAILED] = UNIT_FAILED, + [SWAP_CLEANING] = UNIT_MAINTENANCE, +}; + +static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int swap_process_proc_swaps(Manager *m); + +static bool SWAP_STATE_WITH_PROCESS(SwapState state) { + return IN_SET(state, + SWAP_ACTIVATING, + SWAP_ACTIVATING_DONE, + SWAP_DEACTIVATING, + SWAP_DEACTIVATING_SIGTERM, + SWAP_DEACTIVATING_SIGKILL, + SWAP_CLEANING); +} + +static UnitActiveState swap_active_state(Unit *u) { + assert(u); + + return state_translation_table[SWAP(u)->state]; +} + +static const char *swap_sub_state_to_string(Unit *u) { + assert(u); + + return swap_state_to_string(SWAP(u)->state); +} + +static bool swap_may_gc(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + if (s->from_proc_swaps) + return false; + + return true; +} + +static bool swap_is_extrinsic(Unit *u) { + assert(SWAP(u)); + + return MANAGER_IS_USER(u->manager); +} + +static void swap_unset_proc_swaps(Swap *s) { + assert(s); + + if (!s->from_proc_swaps) + return; + + s->parameters_proc_swaps.what = mfree(s->parameters_proc_swaps.what); + s->from_proc_swaps = false; +} + +static int swap_set_devnode(Swap *s, const char *devnode) { + Hashmap *swaps; + Swap *first; + int r; + + assert(s); + + r = hashmap_ensure_allocated(&UNIT(s)->manager->swaps_by_devnode, &path_hash_ops); + if (r < 0) + return r; + + swaps = UNIT(s)->manager->swaps_by_devnode; + + if (s->devnode) { + first = hashmap_get(swaps, s->devnode); + + LIST_REMOVE(same_devnode, first, s); + if (first) + hashmap_replace(swaps, first->devnode, first); + else + hashmap_remove(swaps, s->devnode); + + s->devnode = mfree(s->devnode); + } + + if (devnode) { + s->devnode = strdup(devnode); + if (!s->devnode) + return -ENOMEM; + + first = hashmap_get(swaps, s->devnode); + LIST_PREPEND(same_devnode, first, s); + + return hashmap_replace(swaps, first->devnode, first); + } + + return 0; +} + +static void swap_init(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + assert(UNIT(s)->load_state == UNIT_STUB); + + s->timeout_usec = u->manager->defaults.timeout_start_usec; + + s->exec_context.std_output = u->manager->defaults.std_output; + s->exec_context.std_error = u->manager->defaults.std_error; + + s->control_pid = PIDREF_NULL; + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + + u->ignore_on_isolate = true; +} + +static void swap_unwatch_control_pid(Swap *s) { + assert(s); + + if (!pidref_is_set(&s->control_pid)) + return; + + unit_unwatch_pidref(UNIT(s), &s->control_pid); + pidref_done(&s->control_pid); +} + +static void swap_done(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + swap_unset_proc_swaps(s); + swap_set_devnode(s, NULL); + + s->what = mfree(s->what); + s->parameters_fragment.what = mfree(s->parameters_fragment.what); + s->parameters_fragment.options = mfree(s->parameters_fragment.options); + + s->exec_runtime = exec_runtime_free(s->exec_runtime); + exec_command_done_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX); + s->control_command = NULL; + + swap_unwatch_control_pid(s); + + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); +} + +static int swap_arm_timer(Swap *s, bool relative, usec_t usec) { + assert(s); + + return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, swap_dispatch_timer); +} + +static SwapParameters* swap_get_parameters(Swap *s) { + assert(s); + + if (s->from_proc_swaps) + return &s->parameters_proc_swaps; + + if (s->from_fragment) + return &s->parameters_fragment; + + return NULL; +} + +static int swap_add_device_dependencies(Swap *s) { + UnitDependencyMask mask; + SwapParameters *p; + int r; + + assert(s); + + if (!s->what) + return 0; + + p = swap_get_parameters(s); + if (!p || !p->what) + return 0; + + mask = s->from_proc_swaps ? UNIT_DEPENDENCY_PROC_SWAP : UNIT_DEPENDENCY_FILE; + + if (is_device_path(p->what)) { + r = unit_add_node_dependency(UNIT(s), p->what, UNIT_REQUIRES, mask); + if (r < 0) + return r; + + return unit_add_blockdev_dependency(UNIT(s), p->what, mask); + } + + /* File based swap devices need to be ordered after systemd-remount-fs.service, since they might need + * a writable file system. */ + return unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, mask); +} + +static int swap_add_default_dependencies(Swap *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + if (!MANAGER_IS_SYSTEM(UNIT(s)->manager)) + return 0; + + if (detect_container() > 0) + return 0; + + /* swap units generated for the swap dev links are missing the + * ordering dep against the swap target. */ + r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SWAP_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int swap_verify(Swap *s) { + _cleanup_free_ char *e = NULL; + int r; + + assert(UNIT(s)->load_state == UNIT_LOADED); + + r = unit_name_from_path(s->what, ".swap", &e); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to generate unit name from path: %m"); + + if (!unit_has_name(UNIT(s), e)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Value of What= and unit name do not match, not loading."); + + if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing to load."); + + return 0; +} + +static int swap_load_devnode(Swap *s) { + _cleanup_free_ char *p = NULL; + struct stat st; + int r; + + assert(s); + + if (stat(s->what, &st) < 0 || !S_ISBLK(st.st_mode)) + return 0; + + r = devname_from_stat_rdev(&st, &p); + if (r < 0) { + log_unit_full_errno(UNIT(s), r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to get device node for swap %s: %m", s->what); + return 0; + } + + return swap_set_devnode(s, p); +} + +static int swap_add_extras(Swap *s) { + int r; + + assert(s); + + if (UNIT(s)->fragment_path) + s->from_fragment = true; + + if (!s->what) { + if (s->parameters_fragment.what) + s->what = strdup(s->parameters_fragment.what); + else if (s->parameters_proc_swaps.what) + s->what = strdup(s->parameters_proc_swaps.what); + else { + r = unit_name_to_path(UNIT(s)->id, &s->what); + if (r < 0) + return r; + } + + if (!s->what) + return -ENOMEM; + } + + path_simplify(s->what); + + if (!UNIT(s)->description) { + r = unit_set_description(UNIT(s), s->what); + if (r < 0) + return r; + } + + r = unit_require_mounts_for(UNIT(s), s->what, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + + r = swap_add_device_dependencies(s); + if (r < 0) + return r; + + r = swap_load_devnode(s); + if (r < 0) + return r; + + r = unit_patch_contexts(UNIT(s)); + if (r < 0) + return r; + + r = unit_add_exec_dependencies(UNIT(s), &s->exec_context); + if (r < 0) + return r; + + r = unit_set_default_slice(UNIT(s)); + if (r < 0) + return r; + + r = swap_add_default_dependencies(s); + if (r < 0) + return r; + + return 0; +} + +static int swap_load(Unit *u) { + Swap *s = SWAP(u); + int r, q = 0; + + assert(s); + assert(u->load_state == UNIT_STUB); + + /* Load a .swap file */ + bool fragment_optional = s->from_proc_swaps; + r = unit_load_fragment_and_dropin(u, !fragment_optional); + + /* Add in some extras, and do so either when we successfully loaded something or when /proc/swaps is + * already active. */ + if (u->load_state == UNIT_LOADED || s->from_proc_swaps) + q = swap_add_extras(s); + + if (r < 0) + return r; + if (q < 0) + return q; + if (u->load_state != UNIT_LOADED) + return 0; + + return swap_verify(s); +} + +static int swap_setup_unit( + Manager *m, + const char *what, + const char *what_proc_swaps, + int priority, + bool set_flags) { + + _cleanup_free_ char *e = NULL; + bool delete = false; + Unit *u = NULL; + int r; + SwapParameters *p; + + assert(m); + assert(what); + assert(what_proc_swaps); + + r = unit_name_from_path(what, ".swap", &e); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to generate unit name from path: %m"); + + u = manager_get_unit(m, e); + if (u && + SWAP(u)->from_proc_swaps && + !path_equal(SWAP(u)->parameters_proc_swaps.what, what_proc_swaps)) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "Swap %s appeared twice with different device paths %s and %s", + e, SWAP(u)->parameters_proc_swaps.what, what_proc_swaps); + + if (!u) { + delete = true; + + r = unit_new_for_name(m, sizeof(Swap), e, &u); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to load swap unit: %m"); + goto fail; + } + + SWAP(u)->what = strdup(what); + if (!SWAP(u)->what) { + r = log_oom(); + goto fail; + } + + unit_add_to_load_queue(u); + } else + delete = false; + + p = &SWAP(u)->parameters_proc_swaps; + + if (!p->what) { + p->what = strdup(what_proc_swaps); + if (!p->what) { + r = log_oom(); + goto fail; + } + } + + /* The unit is definitely around now, mark it as loaded if it was previously referenced but could not be + * loaded. After all we can load it now, from the data in /proc/swaps. */ + if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) { + u->load_state = UNIT_LOADED; + u->load_error = 0; + } + + if (set_flags) { + SWAP(u)->is_active = true; + SWAP(u)->just_activated = !SWAP(u)->from_proc_swaps; + } + + SWAP(u)->from_proc_swaps = true; + + p->priority = priority; + p->priority_set = true; + + unit_add_to_dbus_queue(u); + return 0; + +fail: + if (delete) + unit_free(u); + + return r; +} + +static void swap_process_new(Manager *m, const char *device, int prio, bool set_flags) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + const char *dn; + struct stat st, st_link; + int r; + + assert(m); + + if (swap_setup_unit(m, device, device, prio, set_flags) < 0) + return; + + /* If this is a block device, then let's add duplicates for + * all other names of this block device */ + if (stat(device, &st) < 0 || !S_ISBLK(st.st_mode)) + return; + + r = sd_device_new_from_stat_rdev(&d, &st); + if (r < 0) + return (void) log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to allocate device for swap %s: %m", device); + + /* Add the main device node */ + if (sd_device_get_devname(d, &dn) >= 0 && !streq(dn, device)) + (void) swap_setup_unit(m, dn, device, prio, set_flags); + + /* Add additional units for all symlinks */ + FOREACH_DEVICE_DEVLINK(d, devlink) { + + /* Don't bother with the /dev/block links */ + if (streq(devlink, device)) + continue; + + if (path_startswith(devlink, "/dev/block/")) + continue; + + if (stat(devlink, &st_link) >= 0 && + (!S_ISBLK(st_link.st_mode) || + st_link.st_rdev != st.st_rdev)) + continue; + + (void) swap_setup_unit(m, devlink, device, prio, set_flags); + } +} + +static void swap_set_state(Swap *s, SwapState state) { + SwapState old_state; + + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + old_state = s->state; + s->state = state; + + if (!SWAP_STATE_WITH_PROCESS(state)) { + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + swap_unwatch_control_pid(s); + s->control_command = NULL; + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + } + + if (state != old_state) + log_unit_debug(UNIT(s), "Changed %s -> %s", swap_state_to_string(old_state), swap_state_to_string(state)); + + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); + + /* If there other units for the same device node have a job + queued it might be worth checking again if it is runnable + now. This is necessary, since swap_start() refuses + operation with EAGAIN if there's already another job for + the same device node queued. */ + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (UNIT(other)->job) + job_add_to_run_queue(UNIT(other)->job); +} + +static int swap_coldplug(Unit *u) { + Swap *s = SWAP(u); + SwapState new_state = SWAP_DEAD; + int r; + + assert(s); + assert(s->state == SWAP_DEAD); + + if (s->deserialized_state != s->state) + new_state = s->deserialized_state; + else if (s->from_proc_swaps) + new_state = SWAP_ACTIVE; + + if (new_state == s->state) + return 0; + + if (pidref_is_set(&s->control_pid) && + pidref_is_unwaited(&s->control_pid) > 0 && + SWAP_STATE_WITH_PROCESS(new_state)) { + + r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false); + if (r < 0) + return r; + + r = swap_arm_timer(s, /* relative= */ false, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec)); + if (r < 0) + return r; + } + + if (!IN_SET(new_state, SWAP_DEAD, SWAP_FAILED)) + (void) unit_setup_exec_runtime(u); + + swap_set_state(s, new_state); + return 0; +} + +static void swap_dump(Unit *u, FILE *f, const char *prefix) { + Swap *s = SWAP(u); + SwapParameters *p; + + assert(s); + assert(f); + + if (s->from_proc_swaps) + p = &s->parameters_proc_swaps; + else if (s->from_fragment) + p = &s->parameters_fragment; + else + p = NULL; + + fprintf(f, + "%sSwap State: %s\n" + "%sResult: %s\n" + "%sClean Result: %s\n" + "%sWhat: %s\n" + "%sFrom /proc/swaps: %s\n" + "%sFrom fragment: %s\n" + "%sExtrinsic: %s\n", + prefix, swap_state_to_string(s->state), + prefix, swap_result_to_string(s->result), + prefix, swap_result_to_string(s->clean_result), + prefix, s->what, + prefix, yes_no(s->from_proc_swaps), + prefix, yes_no(s->from_fragment), + prefix, yes_no(swap_is_extrinsic(u))); + + if (s->devnode) + fprintf(f, "%sDevice Node: %s\n", prefix, s->devnode); + + if (p) + fprintf(f, + "%sPriority: %i\n" + "%sOptions: %s\n", + prefix, p->priority, + prefix, strempty(p->options)); + + fprintf(f, + "%sTimeoutSec: %s\n", + prefix, FORMAT_TIMESPAN(s->timeout_usec, USEC_PER_SEC)); + + if (pidref_is_set(&s->control_pid)) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, s->control_pid.pid); + + exec_context_dump(&s->exec_context, f, prefix); + kill_context_dump(&s->kill_context, f, prefix); + cgroup_context_dump(UNIT(s), f, prefix); +} + +static int swap_spawn(Swap *s, ExecCommand *c, PidRef *ret_pid) { + + _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT( + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN); + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + pid_t pid; + int r; + + assert(s); + assert(c); + assert(ret_pid); + + r = unit_prepare_exec(UNIT(s)); + if (r < 0) + return r; + + r = swap_arm_timer(s, /* relative= */ true, s->timeout_usec); + if (r < 0) + return r; + + r = unit_set_exec_params(UNIT(s), &exec_params); + if (r < 0) + return r; + + r = exec_spawn(UNIT(s), + c, + &s->exec_context, + &exec_params, + s->exec_runtime, + &s->cgroup_context, + &pid); + if (r < 0) + return r; + + r = pidref_set_pid(&pidref, pid); + if (r < 0) + return r; + + r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true); + if (r < 0) + return r; + + *ret_pid = TAKE_PIDREF(pidref); + return 0; +} + +static void swap_enter_dead(Swap *s, SwapResult f) { + assert(s); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + unit_log_result(UNIT(s), s->result == SWAP_SUCCESS, swap_result_to_string(s->result)); + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop); + swap_set_state(s, s->result != SWAP_SUCCESS ? SWAP_FAILED : SWAP_DEAD); + + s->exec_runtime = exec_runtime_destroy(s->exec_runtime); + + unit_destroy_runtime_data(UNIT(s), &s->exec_context); + + unit_unref_uid_gid(UNIT(s), true); +} + +static void swap_enter_active(Swap *s, SwapResult f) { + assert(s); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + swap_set_state(s, SWAP_ACTIVE); +} + +static void swap_enter_dead_or_active(Swap *s, SwapResult f) { + assert(s); + + if (s->from_proc_swaps) { + swap_enter_active(s, f); + + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (UNIT(other)->job) + swap_enter_dead_or_active(other, f); + } else + swap_enter_dead(s, f); +} + +static int state_to_kill_operation(Swap *s, SwapState state) { + if (state == SWAP_DEACTIVATING_SIGTERM) { + if (unit_has_job_type(UNIT(s), JOB_RESTART)) + return KILL_RESTART; + else + return KILL_TERMINATE; + } + + return KILL_KILL; +} + +static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) { + int r; + + assert(s); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + r = unit_kill_context( + UNIT(s), + &s->kill_context, + state_to_kill_operation(s, state), + /* main_pid= */ NULL, + &s->control_pid, + /* main_pid_alien= */ false); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + goto fail; + } + + if (r > 0) { + r = swap_arm_timer(s, /* relative= */ true, s->timeout_usec); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m"); + goto fail; + } + + swap_set_state(s, state); + } else if (state == SWAP_DEACTIVATING_SIGTERM && s->kill_context.send_sigkill) + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS); + else + swap_enter_dead_or_active(s, SWAP_SUCCESS); + + return; + +fail: + swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES); +} + +static void swap_enter_activating(Swap *s) { + _cleanup_free_ char *opts = NULL; + int r; + + assert(s); + + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start); + + s->control_command_id = SWAP_EXEC_ACTIVATE; + s->control_command = s->exec_command + SWAP_EXEC_ACTIVATE; + + if (s->from_fragment) { + int priority = 0; + + r = fstab_find_pri(s->parameters_fragment.options, &priority); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to parse swap priority \"%s\", ignoring: %m", s->parameters_fragment.options); + else if (r > 0 && s->parameters_fragment.priority_set) + log_unit_warning(UNIT(s), "Duplicate swap priority configuration by Priority= and Options= fields."); + + if (r <= 0 && s->parameters_fragment.priority_set) { + if (s->parameters_fragment.options) + r = asprintf(&opts, "%s,pri=%i", s->parameters_fragment.options, s->parameters_fragment.priority); + else + r = asprintf(&opts, "pri=%i", s->parameters_fragment.priority); + if (r < 0) { + r = log_oom(); + goto fail; + } + } + } + + r = exec_command_set(s->control_command, "/sbin/swapon", "--fixpgsz", NULL); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to initialize swapon command line: %m"); + goto fail; + } + + if (s->parameters_fragment.options || opts) { + r = exec_command_append(s->control_command, "-o", + opts ?: s->parameters_fragment.options, NULL); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to prepare swapon command line: %m"); + goto fail; + } + } + + r = exec_command_append(s->control_command, s->what, NULL); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to prepare swapon command line: %m"); + goto fail; + } + + swap_unwatch_control_pid(s); + + r = swap_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'swapon' task: %m"); + goto fail; + } + + swap_set_state(s, SWAP_ACTIVATING); + return; + +fail: + swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES); +} + +static void swap_enter_deactivating(Swap *s) { + int r; + + assert(s); + + s->control_command_id = SWAP_EXEC_DEACTIVATE; + s->control_command = s->exec_command + SWAP_EXEC_DEACTIVATE; + + r = exec_command_set(s->control_command, + "/sbin/swapoff", + s->what, + NULL); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to prepare swapoff command line: %m"); + goto fail; + } + + swap_unwatch_control_pid(s); + + r = swap_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'swapoff' task: %m"); + goto fail; + } + + swap_set_state(s, SWAP_DEACTIVATING); + return; + +fail: + swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES); +} + +static void swap_cycle_clear(Swap *s) { + assert(s); + + s->result = SWAP_SUCCESS; + exec_command_reset_status_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX); + UNIT(s)->reset_accounting = true; +} + +static int swap_start(Unit *u) { + Swap *s = SWAP(u); + int r; + + assert(s); + + /* We cannot fulfill this request right now, try again later please! */ + if (IN_SET(s->state, + SWAP_DEACTIVATING, + SWAP_DEACTIVATING_SIGTERM, + SWAP_DEACTIVATING_SIGKILL, + SWAP_CLEANING)) + return -EAGAIN; + + /* Already on it! */ + if (s->state == SWAP_ACTIVATING) + return 0; + + assert(IN_SET(s->state, SWAP_DEAD, SWAP_FAILED)); + + if (detect_container() > 0) + return -EPERM; + + /* If there's a job for another swap unit for the same node + * running, then let's not dispatch this one for now, and wait + * until that other job has finished. */ + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (UNIT(other)->job && UNIT(other)->job->state == JOB_RUNNING) + return -EAGAIN; + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + swap_cycle_clear(s); + swap_enter_activating(s); + return 1; +} + +static int swap_stop(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + switch (s->state) { + + case SWAP_DEACTIVATING: + case SWAP_DEACTIVATING_SIGTERM: + case SWAP_DEACTIVATING_SIGKILL: + /* Already on it */ + return 0; + + case SWAP_ACTIVATING: + case SWAP_ACTIVATING_DONE: + /* There's a control process pending, directly enter kill mode */ + swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_SUCCESS); + return 0; + + case SWAP_ACTIVE: + if (detect_container() > 0) + return -EPERM; + + swap_enter_deactivating(s); + return 1; + + case SWAP_CLEANING: + /* If we are currently cleaning, then abort it, brutally. */ + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS); + return 0; + + default: + assert_not_reached(); + } +} + +static int swap_serialize(Unit *u, FILE *f, FDSet *fds) { + Swap *s = SWAP(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", swap_state_to_string(s->state)); + (void) serialize_item(f, "result", swap_result_to_string(s->result)); + (void) serialize_pidref(f, fds, "control-pid", &s->control_pid); + + if (s->control_command_id >= 0) + (void) serialize_item(f, "control-command", swap_exec_command_to_string(s->control_command_id)); + + return 0; +} + +static int swap_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Swap *s = SWAP(u); + + assert(s); + assert(fds); + + if (streq(key, "state")) { + SwapState state; + + state = swap_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + } else if (streq(key, "result")) { + SwapResult f; + + f = swap_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != SWAP_SUCCESS) + s->result = f; + } else if (streq(key, "control-pid")) { + + pidref_done(&s->control_pid); + (void) deserialize_pidref(fds, value, &s->control_pid); + + } else if (streq(key, "control-command")) { + SwapExecCommand id; + + id = swap_exec_command_from_string(value); + if (id < 0) + log_unit_debug(u, "Failed to parse exec-command value: %s", value); + else { + s->control_command_id = id; + s->control_command = s->exec_command + id; + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Swap *s = SWAP(u); + SwapResult f; + + assert(s); + assert(pid >= 0); + + if (pid != s->control_pid.pid) + return; + + /* Let's scan /proc/swaps before we process SIGCHLD. For the reasoning see the similar code in + * mount.c */ + (void) swap_process_proc_swaps(u->manager); + + pidref_done(&s->control_pid); + + if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + f = SWAP_SUCCESS; + else if (code == CLD_EXITED) + f = SWAP_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = SWAP_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = SWAP_FAILURE_CORE_DUMP; + else + assert_not_reached(); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + if (s->control_command) { + exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status); + + s->control_command = NULL; + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + } + + unit_log_process_exit( + u, + "Swap process", + swap_exec_command_to_string(s->control_command_id), + f == SWAP_SUCCESS, + code, status); + + switch (s->state) { + + case SWAP_ACTIVATING: + case SWAP_ACTIVATING_DONE: + + if (f == SWAP_SUCCESS || s->from_proc_swaps) + swap_enter_active(s, f); + else + swap_enter_dead(s, f); + break; + + case SWAP_DEACTIVATING: + case SWAP_DEACTIVATING_SIGKILL: + case SWAP_DEACTIVATING_SIGTERM: + + swap_enter_dead_or_active(s, f); + break; + + case SWAP_CLEANING: + if (s->clean_result == SWAP_SUCCESS) + s->clean_result = f; + + swap_enter_dead(s, SWAP_SUCCESS); + break; + + default: + assert_not_reached(); + } + + /* Notify clients about changed exit status */ + unit_add_to_dbus_queue(u); +} + +static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Swap *s = SWAP(userdata); + + assert(s); + assert(s->timer_event_source == source); + + switch (s->state) { + + case SWAP_ACTIVATING: + case SWAP_ACTIVATING_DONE: + log_unit_warning(UNIT(s), "Activation timed out. Stopping."); + swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT); + break; + + case SWAP_DEACTIVATING: + log_unit_warning(UNIT(s), "Deactivation timed out. Stopping."); + swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT); + break; + + case SWAP_DEACTIVATING_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Swap process timed out. Killing."); + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Swap process timed out. Skipping SIGKILL. Ignoring."); + swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT); + } + break; + + case SWAP_DEACTIVATING_SIGKILL: + log_unit_warning(UNIT(s), "Swap process still around after SIGKILL. Ignoring."); + swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT); + break; + + case SWAP_CLEANING: + log_unit_warning(UNIT(s), "Cleaning timed out. killing."); + + if (s->clean_result == SWAP_SUCCESS) + s->clean_result = SWAP_FAILURE_TIMEOUT; + + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, 0); + break; + + default: + assert_not_reached(); + } + + return 0; +} + +static int swap_load_proc_swaps(Manager *m, bool set_flags) { + assert(m); + + rewind(m->proc_swaps); + + (void) fscanf(m->proc_swaps, "%*s %*s %*s %*s %*s\n"); + + for (unsigned i = 1;; i++) { + _cleanup_free_ char *dev = NULL, *d = NULL; + int prio = 0, k; + + k = fscanf(m->proc_swaps, + "%ms " /* device/file */ + "%*s " /* type of swap */ + "%*s " /* swap size */ + "%*s " /* used */ + "%i\n", /* priority */ + &dev, &prio); + if (k != 2) { + if (k == EOF) + break; + + log_warning("Failed to parse /proc/swaps:%u, skipping.", i); + continue; + } + + ssize_t l = cunescape(dev, UNESCAPE_RELAX, &d); + if (l < 0) + return log_error_errno(l, "Failed to unescape device path: %m"); + + device_found_node(m, d, DEVICE_FOUND_SWAP, DEVICE_FOUND_SWAP); + + (void) swap_process_new(m, d, prio, set_flags); + } + + return 0; +} + +static int swap_process_proc_swaps(Manager *m) { + int r; + + assert(m); + + r = swap_load_proc_swaps(m, true); + if (r < 0) { + /* Reset flags, just in case, for late calls */ + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) { + Swap *swap = SWAP(u); + + assert(swap); + + swap->is_active = swap->just_activated = false; + } + + return 0; + } + + manager_dispatch_load_queue(m); + + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) { + Swap *swap = SWAP(u); + + assert(swap); + + if (!swap->is_active) { + + swap_unset_proc_swaps(swap); + + switch (swap->state) { + + case SWAP_ACTIVE: + /* This has just been deactivated */ + swap_enter_dead(swap, SWAP_SUCCESS); + break; + + default: + /* Fire again */ + swap_set_state(swap, swap->state); + break; + } + + if (swap->what) + device_found_node(m, swap->what, DEVICE_NOT_FOUND, DEVICE_FOUND_SWAP); + + } else if (swap->just_activated) { + + /* New swap entry */ + + switch (swap->state) { + + case SWAP_DEAD: + case SWAP_FAILED: + (void) unit_acquire_invocation_id(u); + swap_cycle_clear(swap); + swap_enter_active(swap, SWAP_SUCCESS); + break; + + case SWAP_ACTIVATING: + swap_set_state(swap, SWAP_ACTIVATING_DONE); + break; + + default: + /* Nothing really changed, but let's + * issue an notification call + * nonetheless, in case somebody is + * waiting for this. */ + swap_set_state(swap, swap->state); + break; + } + } + + /* Reset the flags for later calls */ + swap->is_active = swap->just_activated = false; + } + + return 1; +} + +static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(revents & EPOLLPRI); + + return swap_process_proc_swaps(m); +} + +static Unit *swap_following(Unit *u) { + Swap *s = SWAP(u); + Swap *first = NULL; + + assert(s); + + /* If the user configured the swap through /etc/fstab or + * a device unit, follow that. */ + + if (s->from_fragment) + return NULL; + + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (other->from_fragment) + return UNIT(other); + + /* Otherwise, make everybody follow the unit that's named after + * the swap device in the kernel */ + + if (streq_ptr(s->what, s->devnode)) + return NULL; + + LIST_FOREACH(same_devnode, other, s->same_devnode_next) + if (streq_ptr(other->what, other->devnode)) + return UNIT(other); + + LIST_FOREACH_BACKWARDS(same_devnode, other, s->same_devnode_prev) { + if (streq_ptr(other->what, other->devnode)) + return UNIT(other); + + first = other; + } + + /* Fall back to the first on the list */ + return UNIT(first); +} + +static int swap_following_set(Unit *u, Set **_set) { + Swap *s = SWAP(u); + _cleanup_set_free_ Set *set = NULL; + int r; + + assert(s); + assert(_set); + + if (LIST_JUST_US(same_devnode, s)) { + *_set = NULL; + return 0; + } + + set = set_new(NULL); + if (!set) + return -ENOMEM; + + LIST_FOREACH_OTHERS(same_devnode, other, s) { + r = set_put(set, other); + if (r < 0) + return r; + } + + *_set = TAKE_PTR(set); + return 1; +} + +static void swap_shutdown(Manager *m) { + assert(m); + + m->swap_event_source = sd_event_source_disable_unref(m->swap_event_source); + m->proc_swaps = safe_fclose(m->proc_swaps); + m->swaps_by_devnode = hashmap_free(m->swaps_by_devnode); +} + +static void swap_enumerate(Manager *m) { + int r; + + assert(m); + + if (!m->proc_swaps) { + m->proc_swaps = fopen("/proc/swaps", "re"); + if (!m->proc_swaps) { + if (errno == ENOENT) + log_debug_errno(errno, "Not swap enabled, skipping enumeration."); + else + log_warning_errno(errno, "Failed to open /proc/swaps, ignoring: %m"); + + return; + } + + r = sd_event_add_io(m->event, &m->swap_event_source, fileno(m->proc_swaps), EPOLLPRI, swap_dispatch_io, m); + if (r < 0) { + log_error_errno(r, "Failed to watch /proc/swaps: %m"); + goto fail; + } + + /* Dispatch this before we dispatch SIGCHLD, so that + * we always get the events from /proc/swaps before + * the SIGCHLD of /sbin/swapon. */ + r = sd_event_source_set_priority(m->swap_event_source, SD_EVENT_PRIORITY_NORMAL-10); + if (r < 0) { + log_error_errno(r, "Failed to change /proc/swaps priority: %m"); + goto fail; + } + + (void) sd_event_source_set_description(m->swap_event_source, "swap-proc"); + } + + r = swap_load_proc_swaps(m, false); + if (r < 0) + goto fail; + + return; + +fail: + swap_shutdown(m); +} + +int swap_process_device_new(Manager *m, sd_device *dev) { + _cleanup_free_ char *e = NULL; + const char *dn; + Unit *u; + int r; + + assert(m); + assert(dev); + + if (sd_device_get_devname(dev, &dn) < 0) + return 0; + + r = unit_name_from_path(dn, ".swap", &e); + if (r < 0) { + log_debug_errno(r, "Cannot convert device name '%s' to unit name, ignoring: %m", dn); + return 0; + } + + u = manager_get_unit(m, e); + if (u) + r = swap_set_devnode(SWAP(u), dn); + + FOREACH_DEVICE_DEVLINK(dev, devlink) { + _cleanup_free_ char *n = NULL; + int q; + + q = unit_name_from_path(devlink, ".swap", &n); + if (q == -EINVAL) /* If the name is not convertible to unit name, we can't manage it */ + continue; + if (q < 0) + return q; + + u = manager_get_unit(m, n); + if (u) { + q = swap_set_devnode(SWAP(u), dn); + if (q < 0) + r = q; + } + } + + return r; +} + +int swap_process_device_remove(Manager *m, sd_device *dev) { + const char *dn; + int r; + Swap *s; + + r = sd_device_get_devname(dev, &dn); + if (r < 0) + return 0; + + while ((s = hashmap_get(m->swaps_by_devnode, dn))) { + int q; + + q = swap_set_devnode(s, NULL); + if (q < 0) + r = q; + } + + return r; +} + +static void swap_reset_failed(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + if (s->state == SWAP_FAILED) + swap_set_state(s, SWAP_DEAD); + + s->result = SWAP_SUCCESS; + s->clean_result = SWAP_SUCCESS; +} + +static int swap_get_timeout(Unit *u, usec_t *timeout) { + Swap *s = SWAP(u); + usec_t t; + int r; + + assert(s); + assert(u); + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static bool swap_supported(void) { + static int supported = -1; + + /* If swap support is not available in the kernel, or we are + * running in a container we don't support swap units, and any + * attempts to starting one should fail immediately. */ + + if (supported < 0) + supported = + access("/proc/swaps", F_OK) >= 0 && + detect_container() <= 0; + + return supported; +} + +static PidRef* swap_control_pid(Unit *u) { + return &ASSERT_PTR(SWAP(u))->control_pid; +} + +static int swap_clean(Unit *u, ExecCleanMask mask) { + _cleanup_strv_free_ char **l = NULL; + Swap *s = SWAP(u); + int r; + + assert(s); + assert(mask != 0); + + if (s->state != SWAP_DEAD) + return -EBUSY; + + r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l); + if (r < 0) + return r; + + if (strv_isempty(l)) + return -EUNATCH; + + swap_unwatch_control_pid(s); + s->clean_result = SWAP_SUCCESS; + s->control_command = NULL; + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + + r = swap_arm_timer(s, /* relative= */ true, s->exec_context.timeout_clean_usec); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to install timer: %m"); + goto fail; + } + + r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m"); + goto fail; + } + + swap_set_state(s, SWAP_CLEANING); + return 0; + +fail: + s->clean_result = SWAP_FAILURE_RESOURCES; + s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); + return r; +} + +static int swap_can_clean(Unit *u, ExecCleanMask *ret) { + Swap *s = SWAP(u); + + assert(s); + + return exec_context_get_clean_mask(&s->exec_context, ret); +} + +static int swap_can_start(Unit *u) { + Swap *s = SWAP(u); + int r; + + assert(s); + + r = unit_test_start_limit(u); + if (r < 0) { + swap_enter_dead(s, SWAP_FAILURE_START_LIMIT_HIT); + return r; + } + + return 1; +} + +int swap_get_priority(const Swap *s) { + assert(s); + + if (s->from_proc_swaps && s->parameters_proc_swaps.priority_set) + return s->parameters_proc_swaps.priority; + + if (s->from_fragment && s->parameters_fragment.priority_set) + return s->parameters_fragment.priority; + + return -1; +} + +const char* swap_get_options(const Swap *s) { + assert(s); + + if (s->from_fragment) + return s->parameters_fragment.options; + + return NULL; +} + +static const char* const swap_exec_command_table[_SWAP_EXEC_COMMAND_MAX] = { + [SWAP_EXEC_ACTIVATE] = "ExecActivate", + [SWAP_EXEC_DEACTIVATE] = "ExecDeactivate", +}; + +DEFINE_STRING_TABLE_LOOKUP(swap_exec_command, SwapExecCommand); + +static const char* const swap_result_table[_SWAP_RESULT_MAX] = { + [SWAP_SUCCESS] = "success", + [SWAP_FAILURE_RESOURCES] = "resources", + [SWAP_FAILURE_TIMEOUT] = "timeout", + [SWAP_FAILURE_EXIT_CODE] = "exit-code", + [SWAP_FAILURE_SIGNAL] = "signal", + [SWAP_FAILURE_CORE_DUMP] = "core-dump", + [SWAP_FAILURE_START_LIMIT_HIT] = "start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(swap_result, SwapResult); + +const UnitVTable swap_vtable = { + .object_size = sizeof(Swap), + .exec_context_offset = offsetof(Swap, exec_context), + .cgroup_context_offset = offsetof(Swap, cgroup_context), + .kill_context_offset = offsetof(Swap, kill_context), + .exec_runtime_offset = offsetof(Swap, exec_runtime), + + .sections = + "Unit\0" + "Swap\0" + "Install\0", + .private_section = "Swap", + + .can_fail = true, + + .init = swap_init, + .load = swap_load, + .done = swap_done, + + .coldplug = swap_coldplug, + + .dump = swap_dump, + + .start = swap_start, + .stop = swap_stop, + + .clean = swap_clean, + .can_clean = swap_can_clean, + + .get_timeout = swap_get_timeout, + + .serialize = swap_serialize, + .deserialize_item = swap_deserialize_item, + + .active_state = swap_active_state, + .sub_state_to_string = swap_sub_state_to_string, + + .will_restart = unit_will_restart_default, + + .may_gc = swap_may_gc, + .is_extrinsic = swap_is_extrinsic, + + .sigchld_event = swap_sigchld_event, + + .reset_failed = swap_reset_failed, + + .control_pid = swap_control_pid, + + .bus_set_property = bus_swap_set_property, + .bus_commit_properties = bus_swap_commit_properties, + + .following = swap_following, + .following_set = swap_following_set, + + .enumerate = swap_enumerate, + .shutdown = swap_shutdown, + .supported = swap_supported, + + .status_message_formats = { + .starting_stopping = { + [0] = "Activating swap %s...", + [1] = "Deactivating swap %s...", + }, + .finished_start_job = { + [JOB_DONE] = "Activated swap %s.", + [JOB_FAILED] = "Failed to activate swap %s.", + [JOB_TIMEOUT] = "Timed out activating swap %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Deactivated swap %s.", + [JOB_FAILED] = "Failed deactivating swap %s.", + [JOB_TIMEOUT] = "Timed out deactivating swap %s.", + }, + }, + + .can_start = swap_can_start, + + .notify_plymouth = true, +}; diff --git a/src/core/swap.h b/src/core/swap.h new file mode 100644 index 0000000..ef20f0f --- /dev/null +++ b/src/core/swap.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2010 Maarten Lankhorst +***/ + +#include "sd-device.h" + +#include "pidref.h" +#include "unit.h" + +typedef struct Swap Swap; + +typedef enum SwapExecCommand { + SWAP_EXEC_ACTIVATE, + SWAP_EXEC_DEACTIVATE, + _SWAP_EXEC_COMMAND_MAX, + _SWAP_EXEC_COMMAND_INVALID = -EINVAL, +} SwapExecCommand; + +typedef enum SwapResult { + SWAP_SUCCESS, + SWAP_FAILURE_RESOURCES, + SWAP_FAILURE_TIMEOUT, + SWAP_FAILURE_EXIT_CODE, + SWAP_FAILURE_SIGNAL, + SWAP_FAILURE_CORE_DUMP, + SWAP_FAILURE_START_LIMIT_HIT, + _SWAP_RESULT_MAX, + _SWAP_RESULT_INVALID = -EINVAL, +} SwapResult; + +typedef struct SwapParameters { + char *what; + char *options; + int priority; + bool priority_set; +} SwapParameters; + +struct Swap { + Unit meta; + + char *what; + + /* If the device has already shown up, this is the device + * node, which might be different from what, due to + * symlinks */ + char *devnode; + + SwapParameters parameters_proc_swaps; + SwapParameters parameters_fragment; + + bool from_proc_swaps:1; + bool from_fragment:1; + + /* Used while looking for swaps that vanished or got added + * from/to /proc/swaps */ + bool is_active:1; + bool just_activated:1; + + SwapResult result; + SwapResult clean_result; + + usec_t timeout_usec; + + ExecCommand exec_command[_SWAP_EXEC_COMMAND_MAX]; + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ExecRuntime *exec_runtime; + + SwapState state, deserialized_state; + + ExecCommand* control_command; + SwapExecCommand control_command_id; + PidRef control_pid; + + sd_event_source *timer_event_source; + + /* In order to be able to distinguish dependencies on + different device nodes we might end up creating multiple + devices for the same swap. We chain them up here. */ + + LIST_FIELDS(struct Swap, same_devnode); +}; + +extern const UnitVTable swap_vtable; + +int swap_process_device_new(Manager *m, sd_device *dev); +int swap_process_device_remove(Manager *m, sd_device *dev); + +int swap_get_priority(const Swap *s); +const char* swap_get_options(const Swap *s); + +const char* swap_exec_command_to_string(SwapExecCommand i) _const_; +SwapExecCommand swap_exec_command_from_string(const char *s) _pure_; + +const char* swap_result_to_string(SwapResult i) _const_; +SwapResult swap_result_from_string(const char *s) _pure_; + +DEFINE_CAST(SWAP, Swap); diff --git a/src/core/system.conf.in b/src/core/system.conf.in new file mode 100644 index 0000000..05eb681 --- /dev/null +++ b/src/core/system.conf.in @@ -0,0 +1,83 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# /etc/systemd/system.conf.d/ directory. The latter is generally recommended. +# Defaults can be restored by simply deleting the main configuration file and +# all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/system.conf' to display the full config. +# +# See systemd-system.conf(5) for details. + +[Manager] +#LogLevel=info +#LogTarget=journal-or-kmsg +#LogColor=yes +#LogLocation=no +#LogTime=no +#DumpCore=yes +#ShowStatus=yes +#CrashChangeVT=no +#CrashShell=no +#CrashReboot=no +#CtrlAltDelBurstAction=reboot-force +#CPUAffinity= +#NUMAPolicy=default +#NUMAMask= +#RuntimeWatchdogSec=off +#RuntimeWatchdogPreSec=off +#RuntimeWatchdogPreGovernor= +#RebootWatchdogSec=10min +#KExecWatchdogSec=off +#WatchdogDevice= +#CapabilityBoundingSet= +#NoNewPrivileges=no +#SystemCallArchitectures= +#TimerSlackNSec= +#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}} +#DefaultTimerAccuracySec=1min +#DefaultStandardOutput=journal +#DefaultStandardError=inherit +#DefaultTimeoutStartSec={{DEFAULT_TIMEOUT_SEC}}s +#DefaultTimeoutStopSec={{DEFAULT_TIMEOUT_SEC}}s +#DefaultTimeoutAbortSec= +#DefaultDeviceTimeoutSec={{DEFAULT_TIMEOUT_SEC}}s +#DefaultRestartSec=100ms +#DefaultStartLimitIntervalSec=10s +#DefaultStartLimitBurst=5 +#DefaultEnvironment= +#DefaultCPUAccounting=yes +#DefaultIOAccounting=no +#DefaultIPAccounting=no +#DefaultMemoryAccounting={{ 'yes' if MEMORY_ACCOUNTING_DEFAULT else 'no' }} +#DefaultTasksAccounting=yes +#DefaultTasksMax=15% +#DefaultLimitCPU= +#DefaultLimitFSIZE= +#DefaultLimitDATA= +#DefaultLimitSTACK= +#DefaultLimitCORE= +#DefaultLimitRSS= +#DefaultLimitNOFILE=1024:{{HIGH_RLIMIT_NOFILE}} +#DefaultLimitAS= +#DefaultLimitNPROC= +#DefaultLimitMEMLOCK=8M +#DefaultLimitLOCKS= +#DefaultLimitSIGPENDING= +#DefaultLimitMSGQUEUE= +#DefaultLimitNICE= +#DefaultLimitRTPRIO= +#DefaultLimitRTTIME= +#DefaultMemoryPressureThresholdSec=200ms +#DefaultMemoryPressureWatch=auto +#DefaultOOMPolicy=stop +#DefaultSmackProcessLabel= +#ReloadLimitIntervalSec= +#ReloadLimitBurst= diff --git a/src/core/systemd.pc.in b/src/core/systemd.pc.in new file mode 100644 index 0000000..f3b85b0 --- /dev/null +++ b/src/core/systemd.pc.in @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +# Names with prefixes are preferred, and the run-together names should be +# considered deprecated (though there is no plan to remove them). New names +# shall have underscores. + +# root_prefix and rootprefix are deprecated since we dropped support for split-usr +# however we used to install units in root_prefix and a lot of downstream software +# overrode this variable in their build system to support installing units elsewhere. +# To stop those builds from silently breaking we keep root_prefix around but have +# it as an alias for prefix +root_prefix={{PREFIX_NOSLASH}} +rootprefix=${root_prefix} +prefix=${rootprefix} +sysconf_dir={{SYSCONF_DIR}} +sysconfdir=${sysconf_dir} + +systemd_util_dir=${prefix}/lib/systemd +systemdutildir=${systemd_util_dir} + +systemd_system_unit_dir=${prefix}/lib/systemd/system +systemdsystemunitdir=${systemd_system_unit_dir} + +systemd_system_preset_dir=${prefix}/lib/systemd/system-preset +systemdsystempresetdir=${systemd_system_preset_dir} + +systemd_user_unit_dir=${prefix}/lib/systemd/user +systemduserunitdir=${systemd_user_unit_dir} + +systemd_user_preset_dir=${prefix}/lib/systemd/user-preset +systemduserpresetdir=${systemd_user_preset_dir} + +systemd_system_conf_dir=${sysconfdir}/systemd/system +systemdsystemconfdir=${systemd_system_conf_dir} + +systemd_user_conf_dir=${sysconfdir}/systemd/user +systemduserconfdir=${systemd_user_conf_dir} + +systemd_system_unit_path=${systemd_system_conf_dir}:/etc/systemd/system:/run/systemd/system:/usr/local/lib/systemd/system:${systemd_system_unit_dir}:/usr/lib/systemd/system:/lib/systemd/system +systemdsystemunitpath=${systemd_system_unit_path} + +systemd_user_unit_path=${systemd_user_conf_dir}:/etc/systemd/user:/run/systemd/user:/usr/local/lib/systemd/user:/usr/local/share/systemd/user:${systemd_user_unit_dir}:/usr/lib/systemd/user:/usr/share/systemd/user +systemduserunitpath=${systemd_user_unit_path} + +systemd_system_generator_dir=${prefix}/lib/systemd/system-generators +systemdsystemgeneratordir=${systemd_system_generator_dir} + +systemd_user_generator_dir=${prefix}/lib/systemd/user-generators +systemdusergeneratordir=${systemd_user_generator_dir} + +systemd_system_generator_path=/run/systemd/system-generators:/etc/systemd/system-generators:/usr/local/lib/systemd/system-generators:${systemd_system_generator_dir} +systemdsystemgeneratorpath=${systemd_system_generator_path} + +systemd_user_generator_path=/run/systemd/user-generators:/etc/systemd/user-generators:/usr/local/lib/systemd/user-generators:${systemd_user_generator_dir} +systemdusergeneratorpath=${systemd_user_generator_path} + +systemd_sleep_dir=${prefix}/lib/systemd/system-sleep +systemdsleepdir=${systemd_sleep_dir} + +systemd_shutdown_dir=${prefix}/lib/systemd/system-shutdown +systemdshutdowndir=${systemd_shutdown_dir} + +tmpfiles_dir=${prefix}/lib/tmpfiles.d +tmpfilesdir=${tmpfiles_dir} + +user_tmpfiles_dir=${prefix}/share/user-tmpfiles.d + +sysusers_dir=${prefix}/lib/sysusers.d +sysusersdir=${sysusers_dir} + +sysctl_dir=${prefix}/lib/sysctl.d +sysctldir=${sysctl_dir} + +binfmt_dir=${prefix}/lib/binfmt.d +binfmtdir=${binfmt_dir} + +modules_load_dir=${prefix}/lib/modules-load.d +modulesloaddir=${modules_load_dir} + +catalog_dir=${prefix}/lib/systemd/catalog +catalogdir=${catalog_dir} + +system_uid_max={{SYSTEM_UID_MAX}} +systemuidmax=${system_uid_max} +system_gid_max={{SYSTEM_GID_MAX}} +systemgidmax=${system_gid_max} + +dynamic_uid_min={{DYNAMIC_UID_MIN}} +dynamicuidmin=${dynamic_uid_min} +dynamic_uid_max={{DYNAMIC_UID_MAX}} +dynamicuidmax=${dynamic_uid_max} + +container_uid_base_min={{CONTAINER_UID_BASE_MIN}} +containeruidbasemin=${container_uid_base_min} +container_uid_base_max={{CONTAINER_UID_BASE_MAX}} +containeruidbasemax=${container_uid_base_max} + +Name: systemd +Description: systemd System and Service Manager +URL: {{PROJECT_URL}} +Version: {{PROJECT_VERSION}} diff --git a/src/core/target.c b/src/core/target.c new file mode 100644 index 0000000..8f2a331 --- /dev/null +++ b/src/core/target.c @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dbus-target.h" +#include "dbus-unit.h" +#include "log.h" +#include "serialize.h" +#include "special.h" +#include "string-util.h" +#include "target.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_TARGET_STATE_MAX] = { + [TARGET_DEAD] = UNIT_INACTIVE, + [TARGET_ACTIVE] = UNIT_ACTIVE +}; + +static void target_set_state(Target *t, TargetState state) { + TargetState old_state; + assert(t); + + if (t->state != state) + bus_unit_send_pending_change_signal(UNIT(t), false); + + old_state = t->state; + t->state = state; + + if (state != old_state) + log_debug("%s changed %s -> %s", + UNIT(t)->id, + target_state_to_string(old_state), + target_state_to_string(state)); + + unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); +} + +static int target_add_default_dependencies(Target *t) { + _cleanup_free_ Unit **others = NULL; + int r, n_others; + + assert(t); + + if (!UNIT(t)->default_dependencies) + return 0; + + /* Imply ordering for requirement dependencies on target units. Note that when the user created a + * contradicting ordering manually we won't add anything in here to make sure we don't create a + * loop. + * + * Note that quite likely iterating through these dependencies will add new dependencies, which + * conflicts with the hashmap-based iteration logic. Hence, instead of iterating through the + * dependencies and acting on them as we go, first take an "atomic snapshot" of sorts and iterate + * through that. */ + + n_others = unit_get_dependency_array(UNIT(t), UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, &others); + if (n_others < 0) + return n_others; + + for (int i = 0; i < n_others; i++) { + r = unit_add_default_target_dependency(others[i], UNIT(t)); + if (r < 0) + return r; + } + + if (unit_has_name(UNIT(t), SPECIAL_SHUTDOWN_TARGET)) + return 0; + + /* Make sure targets are unloaded on shutdown */ + return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int target_load(Unit *u) { + Target *t = TARGET(u); + int r; + + assert(t); + + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + /* This is a new unit? Then let's add in some extras */ + return target_add_default_dependencies(t); +} + +static int target_coldplug(Unit *u) { + Target *t = TARGET(u); + + assert(t); + assert(t->state == TARGET_DEAD); + + if (t->deserialized_state != t->state) + target_set_state(t, t->deserialized_state); + + return 0; +} + +static void target_dump(Unit *u, FILE *f, const char *prefix) { + Target *t = TARGET(u); + + assert(t); + assert(f); + + fprintf(f, + "%sTarget State: %s\n", + prefix, target_state_to_string(t->state)); +} + +static int target_start(Unit *u) { + Target *t = TARGET(u); + int r; + + assert(t); + assert(t->state == TARGET_DEAD); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + target_set_state(t, TARGET_ACTIVE); + return 1; +} + +static int target_stop(Unit *u) { + Target *t = TARGET(u); + + assert(t); + assert(t->state == TARGET_ACTIVE); + + target_set_state(t, TARGET_DEAD); + return 1; +} + +static int target_serialize(Unit *u, FILE *f, FDSet *fds) { + Target *s = TARGET(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", target_state_to_string(s->state)); + return 0; +} + +static int target_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Target *s = TARGET(u); + + assert(s); + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + TargetState state; + + state = target_state_from_string(value); + if (state < 0) + log_debug("Failed to parse state value %s", value); + else + s->deserialized_state = state; + + } else + log_debug("Unknown serialization key '%s'", key); + + return 0; +} + +static UnitActiveState target_active_state(Unit *u) { + assert(u); + + return state_translation_table[TARGET(u)->state]; +} + +static const char *target_sub_state_to_string(Unit *u) { + assert(u); + + return target_state_to_string(TARGET(u)->state); +} + +const UnitVTable target_vtable = { + .object_size = sizeof(Target), + + .sections = + "Unit\0" + "Target\0" + "Install\0", + + .can_fail = true, + + .load = target_load, + .coldplug = target_coldplug, + + .dump = target_dump, + + .start = target_start, + .stop = target_stop, + + .serialize = target_serialize, + .deserialize_item = target_deserialize_item, + + .active_state = target_active_state, + .sub_state_to_string = target_sub_state_to_string, + + .status_message_formats = { + .finished_start_job = { + [JOB_DONE] = "Reached target %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Stopped target %s.", + }, + }, +}; diff --git a/src/core/target.h b/src/core/target.h new file mode 100644 index 0000000..bb909d6 --- /dev/null +++ b/src/core/target.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "unit.h" + +typedef struct Target Target; + +struct Target { + Unit meta; + + TargetState state, deserialized_state; +}; + +extern const UnitVTable target_vtable; + +DEFINE_CAST(TARGET, Target); diff --git a/src/core/timer.c b/src/core/timer.c new file mode 100644 index 0000000..3c41a25 --- /dev/null +++ b/src/core/timer.c @@ -0,0 +1,1106 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "dbus-timer.h" +#include "dbus-unit.h" +#include "fs-util.h" +#include "parse-util.h" +#include "random-util.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "timer.h" +#include "unit-name.h" +#include "unit.h" +#include "user-util.h" +#include "virt.h" + +static const UnitActiveState state_translation_table[_TIMER_STATE_MAX] = { + [TIMER_DEAD] = UNIT_INACTIVE, + [TIMER_WAITING] = UNIT_ACTIVE, + [TIMER_RUNNING] = UNIT_ACTIVE, + [TIMER_ELAPSED] = UNIT_ACTIVE, + [TIMER_FAILED] = UNIT_FAILED +}; + +static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata); + +static void timer_init(Unit *u) { + Timer *t = TIMER(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + t->next_elapse_monotonic_or_boottime = USEC_INFINITY; + t->next_elapse_realtime = USEC_INFINITY; + t->accuracy_usec = u->manager->defaults.timer_accuracy_usec; + t->remain_after_elapse = true; +} + +void timer_free_values(Timer *t) { + TimerValue *v; + + assert(t); + + while ((v = LIST_POP(value, t->values))) { + calendar_spec_free(v->calendar_spec); + free(v); + } +} + +static void timer_done(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + + timer_free_values(t); + + t->monotonic_event_source = sd_event_source_disable_unref(t->monotonic_event_source); + t->realtime_event_source = sd_event_source_disable_unref(t->realtime_event_source); + + t->stamp_path = mfree(t->stamp_path); +} + +static int timer_verify(Timer *t) { + assert(t); + assert(UNIT(t)->load_state == UNIT_LOADED); + + if (!t->values && !t->on_clock_change && !t->on_timezone_change) + return log_unit_error_errno(UNIT(t), SYNTHETIC_ERRNO(ENOEXEC), "Timer unit lacks value setting. Refusing."); + + return 0; +} + +static int timer_add_default_dependencies(Timer *t) { + int r; + + assert(t); + + if (!UNIT(t)->default_dependencies) + return 0; + + r = unit_add_dependency_by_name(UNIT(t), UNIT_BEFORE, SPECIAL_TIMERS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) { + r = unit_add_two_dependencies_by_name(UNIT(t), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + LIST_FOREACH(value, v, t->values) { + if (v->base != TIMER_CALENDAR) + continue; + + FOREACH_STRING(target, SPECIAL_TIME_SYNC_TARGET, SPECIAL_TIME_SET_TARGET) { + r = unit_add_dependency_by_name(UNIT(t), UNIT_AFTER, target, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } + + break; + } + } + + return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int timer_add_trigger_dependencies(Timer *t) { + Unit *x; + int r; + + assert(t); + + if (UNIT_TRIGGER(UNIT(t))) + return 0; + + r = unit_load_related_unit(UNIT(t), ".service", &x); + if (r < 0) + return r; + + return unit_add_two_dependencies(UNIT(t), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int timer_setup_persistent(Timer *t) { + _cleanup_free_ char *stamp_path = NULL; + int r; + + assert(t); + + if (!t->persistent) + return 0; + + if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) { + + r = unit_require_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + stamp_path = strjoin("/var/lib/systemd/timers/stamp-", UNIT(t)->id); + } else { + const char *e; + + e = getenv("XDG_DATA_HOME"); + if (e) + stamp_path = strjoin(e, "/systemd/timers/stamp-", UNIT(t)->id); + else { + + _cleanup_free_ char *h = NULL; + + r = get_home_dir(&h); + if (r < 0) + return log_unit_error_errno(UNIT(t), r, "Failed to determine home directory: %m"); + + stamp_path = strjoin(h, "/.local/share/systemd/timers/stamp-", UNIT(t)->id); + } + } + + if (!stamp_path) + return log_oom(); + + return free_and_replace(t->stamp_path, stamp_path); +} + +static uint64_t timer_get_fixed_delay_hash(Timer *t) { + static const uint8_t hash_key[] = { + 0x51, 0x0a, 0xdb, 0x76, 0x29, 0x51, 0x42, 0xc2, + 0x80, 0x35, 0xea, 0xe6, 0x8e, 0x3a, 0x37, 0xbd + }; + + struct siphash state; + sd_id128_t machine_id; + uid_t uid; + int r; + + assert(t); + + uid = getuid(); + r = sd_id128_get_machine(&machine_id); + if (r < 0) { + log_unit_debug_errno(UNIT(t), r, + "Failed to get machine ID for the fixed delay calculation, proceeding with 0: %m"); + machine_id = SD_ID128_NULL; + } + + siphash24_init(&state, hash_key); + siphash24_compress(&machine_id, sizeof(sd_id128_t), &state); + siphash24_compress_boolean(MANAGER_IS_SYSTEM(UNIT(t)->manager), &state); + siphash24_compress(&uid, sizeof(uid_t), &state); + siphash24_compress_string(UNIT(t)->id, &state); + + return siphash24_finalize(&state); +} + +static int timer_load(Unit *u) { + Timer *t = TIMER(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + /* This is a new unit? Then let's add in some extras */ + r = timer_add_trigger_dependencies(t); + if (r < 0) + return r; + + r = timer_setup_persistent(t); + if (r < 0) + return r; + + r = timer_add_default_dependencies(t); + if (r < 0) + return r; + + return timer_verify(t); +} + +static void timer_dump(Unit *u, FILE *f, const char *prefix) { + Timer *t = TIMER(u); + Unit *trigger; + + trigger = UNIT_TRIGGER(u); + + fprintf(f, + "%sTimer State: %s\n" + "%sResult: %s\n" + "%sUnit: %s\n" + "%sPersistent: %s\n" + "%sWakeSystem: %s\n" + "%sAccuracy: %s\n" + "%sRemainAfterElapse: %s\n" + "%sFixedRandomDelay: %s\n" + "%sOnClockChange: %s\n" + "%sOnTimeZoneChange: %s\n", + prefix, timer_state_to_string(t->state), + prefix, timer_result_to_string(t->result), + prefix, trigger ? trigger->id : "n/a", + prefix, yes_no(t->persistent), + prefix, yes_no(t->wake_system), + prefix, FORMAT_TIMESPAN(t->accuracy_usec, 1), + prefix, yes_no(t->remain_after_elapse), + prefix, yes_no(t->fixed_random_delay), + prefix, yes_no(t->on_clock_change), + prefix, yes_no(t->on_timezone_change)); + + LIST_FOREACH(value, v, t->values) + if (v->base == TIMER_CALENDAR) { + _cleanup_free_ char *p = NULL; + + (void) calendar_spec_to_string(v->calendar_spec, &p); + + fprintf(f, + "%s%s: %s\n", + prefix, + timer_base_to_string(v->base), + strna(p)); + } else + fprintf(f, + "%s%s: %s\n", + prefix, + timer_base_to_string(v->base), + FORMAT_TIMESPAN(v->value, 0)); +} + +static void timer_set_state(Timer *t, TimerState state) { + TimerState old_state; + assert(t); + + if (t->state != state) + bus_unit_send_pending_change_signal(UNIT(t), false); + + old_state = t->state; + t->state = state; + + if (state != TIMER_WAITING) { + t->monotonic_event_source = sd_event_source_disable_unref(t->monotonic_event_source); + t->realtime_event_source = sd_event_source_disable_unref(t->realtime_event_source); + t->next_elapse_monotonic_or_boottime = USEC_INFINITY; + t->next_elapse_realtime = USEC_INFINITY; + } + + if (state != old_state) + log_unit_debug(UNIT(t), "Changed %s -> %s", timer_state_to_string(old_state), timer_state_to_string(state)); + + unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); +} + +static void timer_enter_waiting(Timer *t, bool time_change); + +static int timer_coldplug(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + assert(t->state == TIMER_DEAD); + + if (t->deserialized_state == t->state) + return 0; + + if (t->deserialized_state == TIMER_WAITING) + timer_enter_waiting(t, false); + else + timer_set_state(t, t->deserialized_state); + + return 0; +} + +static void timer_enter_dead(Timer *t, TimerResult f) { + assert(t); + + if (t->result == TIMER_SUCCESS) + t->result = f; + + unit_log_result(UNIT(t), t->result == TIMER_SUCCESS, timer_result_to_string(t->result)); + timer_set_state(t, t->result != TIMER_SUCCESS ? TIMER_FAILED : TIMER_DEAD); +} + +static void timer_enter_elapsed(Timer *t, bool leave_around) { + assert(t); + + /* If a unit is marked with RemainAfterElapse=yes we leave it + * around even after it elapsed once, so that starting it + * later again does not necessarily mean immediate + * retriggering. We unconditionally leave units with + * TIMER_UNIT_ACTIVE or TIMER_UNIT_INACTIVE triggers around, + * since they might be restarted automatically at any time + * later on. */ + + if (t->remain_after_elapse || leave_around) + timer_set_state(t, TIMER_ELAPSED); + else + timer_enter_dead(t, TIMER_SUCCESS); +} + +static void add_random(Timer *t, usec_t *v) { + usec_t add; + + assert(t); + assert(v); + + if (t->random_usec == 0) + return; + if (*v == USEC_INFINITY) + return; + + add = (t->fixed_random_delay ? timer_get_fixed_delay_hash(t) : random_u64()) % t->random_usec; + + if (*v + add < *v) /* overflow */ + *v = (usec_t) -2; /* Highest possible value, that is not USEC_INFINITY */ + else + *v += add; + + log_unit_debug(UNIT(t), "Adding %s random time.", FORMAT_TIMESPAN(add, 0)); +} + +static void timer_enter_waiting(Timer *t, bool time_change) { + bool found_monotonic = false, found_realtime = false; + bool leave_around = false; + triple_timestamp ts; + Unit *trigger; + int r; + + assert(t); + + trigger = UNIT_TRIGGER(UNIT(t)); + if (!trigger) { + log_unit_error(UNIT(t), "Unit to trigger vanished."); + goto fail; + } + + triple_timestamp_now(&ts); + t->next_elapse_monotonic_or_boottime = t->next_elapse_realtime = 0; + + LIST_FOREACH(value, v, t->values) { + if (v->disabled) + continue; + + if (v->base == TIMER_CALENDAR) { + usec_t b, rebased; + + /* If we know the last time this was + * triggered, schedule the job based relative + * to that. If we don't, just start from + * the activation time. */ + + if (dual_timestamp_is_set(&t->last_trigger)) + b = t->last_trigger.realtime; + else if (dual_timestamp_is_set(&UNIT(t)->inactive_exit_timestamp)) + b = UNIT(t)->inactive_exit_timestamp.realtime; + else + b = ts.realtime; + + r = calendar_spec_next_usec(v->calendar_spec, b, &v->next_elapse); + if (r < 0) + continue; + + /* To make the delay due to RandomizedDelaySec= work even at boot, if the scheduled + * time has already passed, set the time when systemd first started as the scheduled + * time. Note that we base this on the monotonic timestamp of the boot, not the + * realtime one, since the wallclock might have been off during boot. */ + rebased = map_clock_usec(UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic, + CLOCK_MONOTONIC, CLOCK_REALTIME); + if (v->next_elapse < rebased) + v->next_elapse = rebased; + + if (!found_realtime) + t->next_elapse_realtime = v->next_elapse; + else + t->next_elapse_realtime = MIN(t->next_elapse_realtime, v->next_elapse); + + found_realtime = true; + + } else { + usec_t base; + + switch (v->base) { + + case TIMER_ACTIVE: + if (state_translation_table[t->state] == UNIT_ACTIVE) + base = UNIT(t)->inactive_exit_timestamp.monotonic; + else + base = ts.monotonic; + break; + + case TIMER_BOOT: + if (detect_container() <= 0) { + /* CLOCK_MONOTONIC equals the uptime on Linux */ + base = 0; + break; + } + /* In a container we don't want to include the time the host + * was already up when the container started, so count from + * our own startup. */ + _fallthrough_; + case TIMER_STARTUP: + base = UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic; + break; + + case TIMER_UNIT_ACTIVE: + leave_around = true; + base = MAX(trigger->inactive_exit_timestamp.monotonic, t->last_trigger.monotonic); + if (base <= 0) + continue; + break; + + case TIMER_UNIT_INACTIVE: + leave_around = true; + base = MAX(trigger->inactive_enter_timestamp.monotonic, t->last_trigger.monotonic); + if (base <= 0) + continue; + break; + + default: + assert_not_reached(); + } + + v->next_elapse = usec_add(usec_shift_clock(base, CLOCK_MONOTONIC, TIMER_MONOTONIC_CLOCK(t)), v->value); + + if (dual_timestamp_is_set(&t->last_trigger) && + !time_change && + v->next_elapse < triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t)) && + IN_SET(v->base, TIMER_ACTIVE, TIMER_BOOT, TIMER_STARTUP)) { + /* This is a one time trigger, disable it now */ + v->disabled = true; + continue; + } + + if (!found_monotonic) + t->next_elapse_monotonic_or_boottime = v->next_elapse; + else + t->next_elapse_monotonic_or_boottime = MIN(t->next_elapse_monotonic_or_boottime, v->next_elapse); + + found_monotonic = true; + } + } + + if (!found_monotonic && !found_realtime && !t->on_timezone_change && !t->on_clock_change) { + log_unit_debug(UNIT(t), "Timer is elapsed."); + timer_enter_elapsed(t, leave_around); + return; + } + + if (found_monotonic) { + usec_t left; + + add_random(t, &t->next_elapse_monotonic_or_boottime); + + left = usec_sub_unsigned(t->next_elapse_monotonic_or_boottime, triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t))); + log_unit_debug(UNIT(t), "Monotonic timer elapses in %s.", FORMAT_TIMESPAN(left, 0)); + + if (t->monotonic_event_source) { + r = sd_event_source_set_time(t->monotonic_event_source, t->next_elapse_monotonic_or_boottime); + if (r < 0) { + log_unit_warning_errno(UNIT(t), r, "Failed to reschedule monotonic event source: %m"); + goto fail; + } + + r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_ONESHOT); + if (r < 0) { + log_unit_warning_errno(UNIT(t), r, "Failed to enable monotonic event source: %m"); + goto fail; + } + } else { + + r = sd_event_add_time( + UNIT(t)->manager->event, + &t->monotonic_event_source, + t->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC, + t->next_elapse_monotonic_or_boottime, t->accuracy_usec, + timer_dispatch, t); + if (r < 0) { + log_unit_warning_errno(UNIT(t), r, "Failed to add monotonic event source: %m"); + goto fail; + } + + (void) sd_event_source_set_description(t->monotonic_event_source, "timer-monotonic"); + } + + } else if (t->monotonic_event_source) { + + r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_OFF); + if (r < 0) { + log_unit_warning_errno(UNIT(t), r, "Failed to disable monotonic event source: %m"); + goto fail; + } + } + + if (found_realtime) { + add_random(t, &t->next_elapse_realtime); + + log_unit_debug(UNIT(t), "Realtime timer elapses at %s.", FORMAT_TIMESTAMP(t->next_elapse_realtime)); + + if (t->realtime_event_source) { + r = sd_event_source_set_time(t->realtime_event_source, t->next_elapse_realtime); + if (r < 0) { + log_unit_warning_errno(UNIT(t), r, "Failed to reschedule realtime event source: %m"); + goto fail; + } + + r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_ONESHOT); + if (r < 0) { + log_unit_warning_errno(UNIT(t), r, "Failed to enable realtime event source: %m"); + goto fail; + } + } else { + r = sd_event_add_time( + UNIT(t)->manager->event, + &t->realtime_event_source, + t->wake_system ? CLOCK_REALTIME_ALARM : CLOCK_REALTIME, + t->next_elapse_realtime, t->accuracy_usec, + timer_dispatch, t); + if (r < 0) { + log_unit_warning_errno(UNIT(t), r, "Failed to add realtime event source: %m"); + goto fail; + } + + (void) sd_event_source_set_description(t->realtime_event_source, "timer-realtime"); + } + + } else if (t->realtime_event_source) { + + r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_OFF); + if (r < 0) { + log_unit_warning_errno(UNIT(t), r, "Failed to disable realtime event source: %m"); + goto fail; + } + } + + timer_set_state(t, TIMER_WAITING); + return; + +fail: + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); +} + +static void timer_enter_running(Timer *t) { + _cleanup_(activation_details_unrefp) ActivationDetails *details = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *trigger; + Job *job; + int r; + + assert(t); + + /* Don't start job if we are supposed to go down */ + if (unit_stop_pending(UNIT(t))) + return; + + trigger = UNIT_TRIGGER(UNIT(t)); + if (!trigger) { + log_unit_error(UNIT(t), "Unit to trigger vanished."); + goto fail; + } + + details = activation_details_new(UNIT(t)); + if (!details) { + log_oom(); + goto fail; + } + + r = manager_add_job(UNIT(t)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, &job); + if (r < 0) { + log_unit_warning(UNIT(t), "Failed to queue unit startup job: %s", bus_error_message(&error, r)); + goto fail; + } + + dual_timestamp_now(&t->last_trigger); + ACTIVATION_DETAILS_TIMER(details)->last_trigger = t->last_trigger; + + job_set_activation_details(job, details); + + if (t->stamp_path) + touch_file(t->stamp_path, true, t->last_trigger.realtime, UID_INVALID, GID_INVALID, MODE_INVALID); + + timer_set_state(t, TIMER_RUNNING); + return; + +fail: + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); +} + +static int timer_start(Unit *u) { + Timer *t = TIMER(u); + int r; + + assert(t); + assert(IN_SET(t->state, TIMER_DEAD, TIMER_FAILED)); + + r = unit_test_trigger_loaded(u); + if (r < 0) + return r; + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + t->last_trigger = DUAL_TIMESTAMP_NULL; + + /* Reenable all timers that depend on unit activation time */ + LIST_FOREACH(value, v, t->values) + if (v->base == TIMER_ACTIVE) + v->disabled = false; + + if (t->stamp_path) { + struct stat st; + + if (stat(t->stamp_path, &st) >= 0) { + usec_t ft; + + /* Load the file timestamp, but only if it is actually in the past. If it is in the future, + * something is wrong with the system clock. */ + + ft = timespec_load(&st.st_mtim); + if (ft < now(CLOCK_REALTIME)) + t->last_trigger.realtime = ft; + else + log_unit_warning(u, "Not using persistent file timestamp %s as it is in the future.", + FORMAT_TIMESTAMP(ft)); + + } else if (errno == ENOENT) + /* The timer has never run before, make sure a stamp file exists. */ + (void) touch_file(t->stamp_path, true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID); + } + + t->result = TIMER_SUCCESS; + timer_enter_waiting(t, false); + return 1; +} + +static int timer_stop(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + assert(IN_SET(t->state, TIMER_WAITING, TIMER_RUNNING, TIMER_ELAPSED)); + + timer_enter_dead(t, TIMER_SUCCESS); + return 1; +} + +static int timer_serialize(Unit *u, FILE *f, FDSet *fds) { + Timer *t = TIMER(u); + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", timer_state_to_string(t->state)); + (void) serialize_item(f, "result", timer_result_to_string(t->result)); + + if (dual_timestamp_is_set(&t->last_trigger)) + (void) serialize_usec(f, "last-trigger-realtime", t->last_trigger.realtime); + + if (t->last_trigger.monotonic > 0) + (void) serialize_usec(f, "last-trigger-monotonic", t->last_trigger.monotonic); + + return 0; +} + +static int timer_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Timer *t = TIMER(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + TimerState state; + + state = timer_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + t->deserialized_state = state; + + } else if (streq(key, "result")) { + TimerResult f; + + f = timer_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != TIMER_SUCCESS) + t->result = f; + + } else if (streq(key, "last-trigger-realtime")) + (void) deserialize_usec(value, &t->last_trigger.realtime); + else if (streq(key, "last-trigger-monotonic")) + (void) deserialize_usec(value, &t->last_trigger.monotonic); + else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static UnitActiveState timer_active_state(Unit *u) { + assert(u); + + return state_translation_table[TIMER(u)->state]; +} + +static const char *timer_sub_state_to_string(Unit *u) { + assert(u); + + return timer_state_to_string(TIMER(u)->state); +} + +static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata) { + Timer *t = TIMER(userdata); + + assert(t); + + if (t->state != TIMER_WAITING) + return 0; + + log_unit_debug(UNIT(t), "Timer elapsed."); + timer_enter_running(t); + return 0; +} + +static void timer_trigger_notify(Unit *u, Unit *other) { + Timer *t = TIMER(u); + + assert(u); + assert(other); + + /* Filter out invocations with bogus state */ + assert(UNIT_IS_LOAD_COMPLETE(other->load_state)); + + /* Reenable all timers that depend on unit state */ + LIST_FOREACH(value, v, t->values) + if (IN_SET(v->base, TIMER_UNIT_ACTIVE, TIMER_UNIT_INACTIVE)) + v->disabled = false; + + switch (t->state) { + + case TIMER_WAITING: + case TIMER_ELAPSED: + + /* Recalculate sleep time */ + timer_enter_waiting(t, false); + break; + + case TIMER_RUNNING: + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) { + log_unit_debug(UNIT(t), "Got notified about unit deactivation."); + timer_enter_waiting(t, false); + } + break; + + case TIMER_DEAD: + case TIMER_FAILED: + break; + + default: + assert_not_reached(); + } +} + +static void timer_reset_failed(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + + if (t->state == TIMER_FAILED) + timer_set_state(t, TIMER_DEAD); + + t->result = TIMER_SUCCESS; +} + +static void timer_time_change(Unit *u) { + Timer *t = TIMER(u); + usec_t ts; + + assert(u); + + if (t->state != TIMER_WAITING) + return; + + /* If we appear to have triggered in the future, the system clock must + * have been set backwards. So let's rewind our own clock and allow + * the future triggers to happen again :). Exactly the same as when + * you start a timer unit with Persistent=yes. */ + ts = now(CLOCK_REALTIME); + if (t->last_trigger.realtime > ts) + t->last_trigger.realtime = ts; + + if (t->on_clock_change) { + log_unit_debug(u, "Time change, triggering activation."); + timer_enter_running(t); + } else { + log_unit_debug(u, "Time change, recalculating next elapse."); + timer_enter_waiting(t, true); + } +} + +static void timer_timezone_change(Unit *u) { + Timer *t = TIMER(u); + + assert(u); + + if (t->state != TIMER_WAITING) + return; + + if (t->on_timezone_change) { + log_unit_debug(u, "Timezone change, triggering activation."); + timer_enter_running(t); + } else { + log_unit_debug(u, "Timezone change, recalculating next elapse."); + timer_enter_waiting(t, false); + } +} + +static int timer_clean(Unit *u, ExecCleanMask mask) { + Timer *t = TIMER(u); + int r; + + assert(t); + assert(mask != 0); + + if (t->state != TIMER_DEAD) + return -EBUSY; + + if (mask != EXEC_CLEAN_STATE) + return -EUNATCH; + + r = timer_setup_persistent(t); + if (r < 0) + return r; + + if (!t->stamp_path) + return -EUNATCH; + + if (unlink(t->stamp_path) && errno != ENOENT) + return log_unit_error_errno(u, errno, "Failed to clean stamp file of timer: %m"); + + return 0; +} + +static int timer_can_clean(Unit *u, ExecCleanMask *ret) { + Timer *t = TIMER(u); + + assert(t); + assert(ret); + + *ret = t->persistent ? EXEC_CLEAN_STATE : 0; + return 0; +} + +static int timer_can_start(Unit *u) { + Timer *t = TIMER(u); + int r; + + assert(t); + + r = unit_test_start_limit(u); + if (r < 0) { + timer_enter_dead(t, TIMER_FAILURE_START_LIMIT_HIT); + return r; + } + + return 1; +} + +static void activation_details_timer_serialize(ActivationDetails *details, FILE *f) { + ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details); + + assert(details); + assert(f); + assert(t); + + (void) serialize_dual_timestamp(f, "activation-details-timer-last-trigger", &t->last_trigger); +} + +static int activation_details_timer_deserialize(const char *key, const char *value, ActivationDetails **details) { + int r; + + assert(key); + assert(value); + + if (!details || !*details) + return -EINVAL; + + ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(*details); + if (!t) + return -EINVAL; + + if (!streq(key, "activation-details-timer-last-trigger")) + return -EINVAL; + + r = deserialize_dual_timestamp(value, &t->last_trigger); + if (r < 0) + return r; + + return 0; +} + +static int activation_details_timer_append_env(ActivationDetails *details, char ***strv) { + ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details); + int r; + + assert(details); + assert(strv); + assert(t); + + if (!dual_timestamp_is_set(&t->last_trigger)) + return 0; + + r = strv_extendf(strv, "TRIGGER_TIMER_REALTIME_USEC=" USEC_FMT, t->last_trigger.realtime); + if (r < 0) + return r; + + r = strv_extendf(strv, "TRIGGER_TIMER_MONOTONIC_USEC=" USEC_FMT, t->last_trigger.monotonic); + if (r < 0) + return r; + + return 2; /* Return the number of variables added to the env block */ +} + +static int activation_details_timer_append_pair(ActivationDetails *details, char ***strv) { + ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details); + int r; + + assert(details); + assert(strv); + assert(t); + + if (!dual_timestamp_is_set(&t->last_trigger)) + return 0; + + r = strv_extend(strv, "trigger_timer_realtime_usec"); + if (r < 0) + return r; + + r = strv_extendf(strv, USEC_FMT, t->last_trigger.realtime); + if (r < 0) + return r; + + r = strv_extend(strv, "trigger_timer_monotonic_usec"); + if (r < 0) + return r; + + r = strv_extendf(strv, USEC_FMT, t->last_trigger.monotonic); + if (r < 0) + return r; + + return 2; /* Return the number of pairs added to the env block */ +} + +uint64_t timer_next_elapse_monotonic(const Timer *t) { + assert(t); + + return (uint64_t) usec_shift_clock(t->next_elapse_monotonic_or_boottime, + TIMER_MONOTONIC_CLOCK(t), CLOCK_MONOTONIC); +} + +static const char* const timer_base_table[_TIMER_BASE_MAX] = { + [TIMER_ACTIVE] = "OnActiveSec", + [TIMER_BOOT] = "OnBootSec", + [TIMER_STARTUP] = "OnStartupSec", + [TIMER_UNIT_ACTIVE] = "OnUnitActiveSec", + [TIMER_UNIT_INACTIVE] = "OnUnitInactiveSec", + [TIMER_CALENDAR] = "OnCalendar" +}; + +DEFINE_STRING_TABLE_LOOKUP(timer_base, TimerBase); + +char* timer_base_to_usec_string(TimerBase i) { + _cleanup_free_ char *buf = NULL; + const char *s; + size_t l; + + s = timer_base_to_string(i); + + if (endswith(s, "Sec")) { + /* s/Sec/USec/ */ + l = strlen(s); + buf = new(char, l+2); + if (!buf) + return NULL; + + memcpy(buf, s, l-3); + memcpy(buf+l-3, "USec", 5); + } else { + buf = strdup(s); + if (!buf) + return NULL; + } + + return TAKE_PTR(buf); +} + +static const char* const timer_result_table[_TIMER_RESULT_MAX] = { + [TIMER_SUCCESS] = "success", + [TIMER_FAILURE_RESOURCES] = "resources", + [TIMER_FAILURE_START_LIMIT_HIT] = "start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(timer_result, TimerResult); + +const UnitVTable timer_vtable = { + .object_size = sizeof(Timer), + + .sections = + "Unit\0" + "Timer\0" + "Install\0", + .private_section = "Timer", + + .can_transient = true, + .can_fail = true, + .can_trigger = true, + + .init = timer_init, + .done = timer_done, + .load = timer_load, + + .coldplug = timer_coldplug, + + .dump = timer_dump, + + .start = timer_start, + .stop = timer_stop, + + .clean = timer_clean, + .can_clean = timer_can_clean, + + .serialize = timer_serialize, + .deserialize_item = timer_deserialize_item, + + .active_state = timer_active_state, + .sub_state_to_string = timer_sub_state_to_string, + + .trigger_notify = timer_trigger_notify, + + .reset_failed = timer_reset_failed, + .time_change = timer_time_change, + .timezone_change = timer_timezone_change, + + .bus_set_property = bus_timer_set_property, + + .can_start = timer_can_start, +}; + +const ActivationDetailsVTable activation_details_timer_vtable = { + .object_size = sizeof(ActivationDetailsTimer), + + .serialize = activation_details_timer_serialize, + .deserialize = activation_details_timer_deserialize, + .append_env = activation_details_timer_append_env, + .append_pair = activation_details_timer_append_pair, +}; diff --git a/src/core/timer.h b/src/core/timer.h new file mode 100644 index 0000000..76d45b2 --- /dev/null +++ b/src/core/timer.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Timer Timer; +typedef struct ActivationDetailsTimer ActivationDetailsTimer; + +#include "calendarspec.h" +#include "unit.h" + +typedef enum TimerBase { + TIMER_ACTIVE, + TIMER_BOOT, + TIMER_STARTUP, + TIMER_UNIT_ACTIVE, + TIMER_UNIT_INACTIVE, + TIMER_CALENDAR, + _TIMER_BASE_MAX, + _TIMER_BASE_INVALID = -EINVAL, +} TimerBase; + +typedef struct TimerValue { + TimerBase base; + bool disabled; + + usec_t value; /* only for monotonic events */ + CalendarSpec *calendar_spec; /* only for calendar events */ + usec_t next_elapse; + + LIST_FIELDS(struct TimerValue, value); +} TimerValue; + +typedef enum TimerResult { + TIMER_SUCCESS, + TIMER_FAILURE_RESOURCES, + TIMER_FAILURE_START_LIMIT_HIT, + _TIMER_RESULT_MAX, + _TIMER_RESULT_INVALID = -EINVAL, +} TimerResult; + +struct Timer { + Unit meta; + + usec_t accuracy_usec; + usec_t random_usec; + + LIST_HEAD(TimerValue, values); + usec_t next_elapse_realtime; + usec_t next_elapse_monotonic_or_boottime; + dual_timestamp last_trigger; + + TimerState state, deserialized_state; + + sd_event_source *monotonic_event_source; + sd_event_source *realtime_event_source; + + TimerResult result; + + bool persistent; + bool wake_system; + bool remain_after_elapse; + bool on_clock_change; + bool on_timezone_change; + bool fixed_random_delay; + + char *stamp_path; +}; + +struct ActivationDetailsTimer { + ActivationDetails meta; + dual_timestamp last_trigger; +}; + +#define TIMER_MONOTONIC_CLOCK(t) ((t)->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC) + +uint64_t timer_next_elapse_monotonic(const Timer *t); + +void timer_free_values(Timer *t); + +extern const UnitVTable timer_vtable; +extern const ActivationDetailsVTable activation_details_timer_vtable; + +const char *timer_base_to_string(TimerBase i) _const_; +TimerBase timer_base_from_string(const char *s) _pure_; + +char* timer_base_to_usec_string(TimerBase i); + +const char* timer_result_to_string(TimerResult i) _const_; +TimerResult timer_result_from_string(const char *s) _pure_; + +DEFINE_CAST(TIMER, Timer); +DEFINE_ACTIVATION_DETAILS_CAST(ACTIVATION_DETAILS_TIMER, ActivationDetailsTimer, TIMER); diff --git a/src/core/transaction.c b/src/core/transaction.c new file mode 100644 index 0000000..a81c40f --- /dev/null +++ b/src/core/transaction.c @@ -0,0 +1,1261 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "dbus-unit.h" +#include "strv.h" +#include "terminal-util.h" +#include "transaction.h" + +static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies); + +static void transaction_delete_job(Transaction *tr, Job *j, bool delete_dependencies) { + assert(tr); + assert(j); + + /* Deletes one job from the transaction */ + + transaction_unlink_job(tr, j, delete_dependencies); + + job_free(j); +} + +static void transaction_delete_unit(Transaction *tr, Unit *u) { + Job *j; + + /* Deletes all jobs associated with a certain unit from the + * transaction */ + + while ((j = hashmap_get(tr->jobs, u))) + transaction_delete_job(tr, j, true); +} + +static void transaction_abort(Transaction *tr) { + Job *j; + + assert(tr); + + while ((j = hashmap_first(tr->jobs))) + transaction_delete_job(tr, j, false); + + assert(hashmap_isempty(tr->jobs)); +} + +static void transaction_find_jobs_that_matter_to_anchor(Job *j, unsigned generation) { + assert(j); + + /* A recursive sweep through the graph that marks all units + * that matter to the anchor job, i.e. are directly or + * indirectly a dependency of the anchor job via paths that + * are fully marked as mattering. */ + + j->matters_to_anchor = true; + j->generation = generation; + + LIST_FOREACH(subject, l, j->subject_list) { + + /* This link does not matter */ + if (!l->matters) + continue; + + /* This unit has already been marked */ + if (l->object->generation == generation) + continue; + + transaction_find_jobs_that_matter_to_anchor(l->object, generation); + } +} + +static void transaction_merge_and_delete_job(Transaction *tr, Job *j, Job *other, JobType t) { + JobDependency *last; + + assert(j); + assert(other); + assert(j->unit == other->unit); + assert(!j->installed); + + /* Merges 'other' into 'j' and then deletes 'other'. */ + + j->type = t; + j->state = JOB_WAITING; + j->irreversible = j->irreversible || other->irreversible; + j->matters_to_anchor = j->matters_to_anchor || other->matters_to_anchor; + + /* Patch us in as new owner of the JobDependency objects */ + last = NULL; + LIST_FOREACH(subject, l, other->subject_list) { + assert(l->subject == other); + l->subject = j; + last = l; + } + + /* Merge both lists */ + if (last) { + last->subject_next = j->subject_list; + if (j->subject_list) + j->subject_list->subject_prev = last; + j->subject_list = other->subject_list; + } + + /* Patch us in as new owner of the JobDependency objects */ + last = NULL; + LIST_FOREACH(object, l, other->object_list) { + assert(l->object == other); + l->object = j; + last = l; + } + + /* Merge both lists */ + if (last) { + last->object_next = j->object_list; + if (j->object_list) + j->object_list->object_prev = last; + j->object_list = other->object_list; + } + + /* Kill the other job */ + other->subject_list = NULL; + other->object_list = NULL; + transaction_delete_job(tr, other, true); +} + +static bool job_is_conflicted_by(Job *j) { + assert(j); + + /* Returns true if this job is pulled in by a least one + * ConflictedBy dependency. */ + + LIST_FOREACH(object, l, j->object_list) + if (l->conflicts) + return true; + + return false; +} + +static int delete_one_unmergeable_job(Transaction *tr, Job *job) { + assert(job); + + /* Tries to delete one item in the linked list + * j->transaction_next->transaction_next->... that conflicts + * with another one, in an attempt to make an inconsistent + * transaction work. */ + + /* We rely here on the fact that if a merged with b does not + * merge with c, either a or b merge with c neither */ + LIST_FOREACH(transaction, j, job) + LIST_FOREACH(transaction, k, j->transaction_next) { + Job *d; + + /* Is this one mergeable? Then skip it */ + if (job_type_is_mergeable(j->type, k->type)) + continue; + + /* Ok, we found two that conflict, let's see if we can + * drop one of them */ + if (!j->matters_to_anchor && !k->matters_to_anchor) { + + /* Both jobs don't matter, so let's + * find the one that is smarter to + * remove. Let's think positive and + * rather remove stops then starts -- + * except if something is being + * stopped because it is conflicted by + * another unit in which case we + * rather remove the start. */ + + log_unit_debug(j->unit, + "Looking at job %s/%s conflicted_by=%s", + j->unit->id, job_type_to_string(j->type), + yes_no(j->type == JOB_STOP && job_is_conflicted_by(j))); + log_unit_debug(k->unit, + "Looking at job %s/%s conflicted_by=%s", + k->unit->id, job_type_to_string(k->type), + yes_no(k->type == JOB_STOP && job_is_conflicted_by(k))); + + if (j->type == JOB_STOP) { + + if (job_is_conflicted_by(j)) + d = k; + else + d = j; + + } else if (k->type == JOB_STOP) { + + if (job_is_conflicted_by(k)) + d = j; + else + d = k; + } else + d = j; + + } else if (!j->matters_to_anchor) + d = j; + else if (!k->matters_to_anchor) + d = k; + else + return -ENOEXEC; + + /* Ok, we can drop one, so let's do so. */ + log_unit_debug(d->unit, + "Fixing conflicting jobs %s/%s,%s/%s by deleting job %s/%s", + j->unit->id, job_type_to_string(j->type), + k->unit->id, job_type_to_string(k->type), + d->unit->id, job_type_to_string(d->type)); + transaction_delete_job(tr, d, true); + return 0; + } + + return -EINVAL; +} + +static int transaction_merge_jobs(Transaction *tr, sd_bus_error *e) { + Job *j; + int r; + + assert(tr); + + /* First step, check whether any of the jobs for one specific + * task conflict. If so, try to drop one of them. */ + HASHMAP_FOREACH(j, tr->jobs) { + JobType t; + + t = j->type; + LIST_FOREACH(transaction, k, j->transaction_next) { + if (job_type_merge_and_collapse(&t, k->type, j->unit) >= 0) + continue; + + /* OK, we could not merge all jobs for this + * action. Let's see if we can get rid of one + * of them */ + + r = delete_one_unmergeable_job(tr, j); + if (r >= 0) + /* Ok, we managed to drop one, now + * let's ask our callers to call us + * again after garbage collecting */ + return -EAGAIN; + + /* We couldn't merge anything. Failure */ + return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_JOBS_CONFLICTING, + "Transaction contains conflicting jobs '%s' and '%s' for %s. " + "Probably contradicting requirement dependencies configured.", + job_type_to_string(t), + job_type_to_string(k->type), + k->unit->id); + } + } + + /* Second step, merge the jobs. */ + HASHMAP_FOREACH(j, tr->jobs) { + JobType t = j->type; + + /* Merge all transaction jobs for j->unit */ + LIST_FOREACH(transaction, k, j->transaction_next) + assert_se(job_type_merge_and_collapse(&t, k->type, j->unit) == 0); + + Job *k; + while ((k = j->transaction_next)) { + if (tr->anchor_job == k) { + transaction_merge_and_delete_job(tr, k, j, t); + j = k; + } else + transaction_merge_and_delete_job(tr, j, k, t); + } + + assert(!j->transaction_next); + assert(!j->transaction_prev); + } + + return 0; +} + +static void transaction_drop_redundant(Transaction *tr) { + bool again; + + /* Goes through the transaction and removes all jobs of the units whose jobs are all noops. If not + * all of a unit's jobs are redundant, they are kept. */ + + assert(tr); + + do { + Job *j; + + again = false; + + HASHMAP_FOREACH(j, tr->jobs) { + bool keep = false; + + LIST_FOREACH(transaction, k, j) + if (tr->anchor_job == k || + !job_type_is_redundant(k->type, unit_active_state(k->unit)) || + (k->unit->job && job_type_is_conflicting(k->type, k->unit->job->type))) { + keep = true; + break; + } + + if (!keep) { + log_trace("Found redundant job %s/%s, dropping from transaction.", + j->unit->id, job_type_to_string(j->type)); + transaction_delete_job(tr, j, false); + again = true; + break; + } + } + } while (again); +} + +static bool job_matters_to_anchor(Job *job) { + assert(job); + assert(!job->transaction_prev); + + /* Checks whether at least one of the jobs for this transaction matters to the anchor. */ + + LIST_FOREACH(transaction, j, job) + if (j->matters_to_anchor) + return true; + + return false; +} + +static char* merge_unit_ids(const char* unit_log_field, char * const* pairs) { + _cleanup_free_ char *ans = NULL; + size_t size = 0; + + assert(unit_log_field); + + STRV_FOREACH_PAIR(unit_id, job_type, pairs) { + size_t next; + + if (size > 0) + ans[size - 1] = '\n'; + + next = strlen(unit_log_field) + strlen(*unit_id); + if (!GREEDY_REALLOC(ans, size + next + 1)) + return NULL; + + sprintf(ans + size, "%s%s", unit_log_field, *unit_id); + size += next + 1; + } + + if (!ans) + return strdup(""); + + return TAKE_PTR(ans); +} + +static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsigned generation, sd_bus_error *e) { + + static const UnitDependencyAtom directions[] = { + UNIT_ATOM_BEFORE, + UNIT_ATOM_AFTER, + }; + + int r; + + assert(tr); + assert(j); + assert(!j->transaction_prev); + + /* Does a recursive sweep through the ordering graph, looking for a cycle. If we find a cycle we try + * to break it. */ + + /* Have we seen this before? */ + if (j->generation == generation) { + Job *k, *delete = NULL; + _cleanup_free_ char **array = NULL, *unit_ids = NULL; + + /* If the marker is NULL we have been here already and decided the job was loop-free from + * here. Hence shortcut things and return right-away. */ + if (!j->marker) + return 0; + + /* So, the marker is not NULL and we already have been here. We have a cycle. Let's try to + * break it. We go backwards in our path and try to find a suitable job to remove. We use the + * marker to find our way back, since smart how we are we stored our way back in there. */ + for (k = from; k; k = ((k->generation == generation && k->marker != k) ? k->marker : NULL)) { + + /* For logging below */ + if (strv_push_pair(&array, k->unit->id, (char*) job_type_to_string(k->type)) < 0) + log_oom(); + + if (!delete && hashmap_contains(tr->jobs, k->unit) && !job_matters_to_anchor(k)) + /* Ok, we can drop this one, so let's do so. */ + delete = k; + + /* Check if this in fact was the beginning of the cycle */ + if (k == j) + break; + } + + unit_ids = merge_unit_ids(j->manager->unit_log_field, array); /* ignore error */ + + STRV_FOREACH_PAIR(unit_id, job_type, array) + /* logging for j not k here to provide a consistent narrative */ + log_struct(LOG_WARNING, + LOG_UNIT_MESSAGE(j->unit, + "Found %s on %s/%s", + unit_id == array ? "ordering cycle" : "dependency", + *unit_id, *job_type), + "%s", strna(unit_ids)); + + if (delete) { + const char *status; + /* logging for j not k here to provide a consistent narrative */ + log_struct(LOG_ERR, + LOG_UNIT_MESSAGE(j->unit, + "Job %s/%s deleted to break ordering cycle starting with %s/%s", + delete->unit->id, job_type_to_string(delete->type), + j->unit->id, job_type_to_string(j->type)), + "%s", strna(unit_ids)); + + if (log_get_show_color()) + status = ANSI_HIGHLIGHT_RED " SKIP " ANSI_NORMAL; + else + status = " SKIP "; + + unit_status_printf(delete->unit, + STATUS_TYPE_NOTICE, + status, + "Ordering cycle found, skipping %s", + unit_status_string(delete->unit, NULL)); + transaction_delete_unit(tr, delete->unit); + return -EAGAIN; + } + + log_struct(LOG_ERR, + LOG_UNIT_MESSAGE(j->unit, "Unable to break cycle starting with %s/%s", + j->unit->id, job_type_to_string(j->type)), + "%s", strna(unit_ids)); + + return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_ORDER_IS_CYCLIC, + "Transaction order is cyclic. See system logs for details."); + } + + /* Make the marker point to where we come from, so that we can + * find our way backwards if we want to break a cycle. We use + * a special marker for the beginning: we point to + * ourselves. */ + j->marker = from ?: j; + j->generation = generation; + + /* Actual ordering of jobs depends on the unit ordering dependency and job types. We need to traverse + * the graph over 'before' edges in the actual job execution order. We traverse over both unit + * ordering dependencies and we test with job_compare() whether it is the 'before' edge in the job + * execution ordering. */ + for (size_t d = 0; d < ELEMENTSOF(directions); d++) { + Unit *u; + + UNIT_FOREACH_DEPENDENCY(u, j->unit, directions[d]) { + Job *o; + + /* Is there a job for this unit? */ + o = hashmap_get(tr->jobs, u); + if (!o) { + /* Ok, there is no job for this in the transaction, but maybe there is + * already one running? */ + o = u->job; + if (!o) + continue; + } + + /* Cut traversing if the job j is not really *before* o. */ + if (job_compare(j, o, directions[d]) >= 0) + continue; + + r = transaction_verify_order_one(tr, o, j, generation, e); + if (r < 0) + return r; + } + } + + /* Ok, let's backtrack, and remember that this entry is not on + * our path anymore. */ + j->marker = NULL; + + return 0; +} + +static int transaction_verify_order(Transaction *tr, unsigned *generation, sd_bus_error *e) { + Job *j; + int r; + unsigned g; + + assert(tr); + assert(generation); + + /* Check if the ordering graph is cyclic. If it is, try to fix + * that up by dropping one of the jobs. */ + + g = (*generation)++; + + HASHMAP_FOREACH(j, tr->jobs) { + r = transaction_verify_order_one(tr, j, NULL, g, e); + if (r < 0) + return r; + } + + return 0; +} + +static void transaction_collect_garbage(Transaction *tr) { + bool again; + + assert(tr); + + /* Drop jobs that are not required by any other job */ + + do { + Job *j; + + again = false; + + HASHMAP_FOREACH(j, tr->jobs) { + if (tr->anchor_job == j) + continue; + + if (!j->object_list) { + log_trace("Garbage collecting job %s/%s", j->unit->id, job_type_to_string(j->type)); + transaction_delete_job(tr, j, true); + again = true; + break; + } + + log_trace("Keeping job %s/%s because of %s/%s", + j->unit->id, job_type_to_string(j->type), + j->object_list->subject ? j->object_list->subject->unit->id : "root", + j->object_list->subject ? job_type_to_string(j->object_list->subject->type) : "root"); + } + + } while (again); +} + +static int transaction_is_destructive(Transaction *tr, JobMode mode, sd_bus_error *e) { + Job *j; + + assert(tr); + + /* Checks whether applying this transaction means that + * existing jobs would be replaced */ + + HASHMAP_FOREACH(j, tr->jobs) { + + /* Assume merged */ + assert(!j->transaction_prev); + assert(!j->transaction_next); + + if (j->unit->job && (mode == JOB_FAIL || j->unit->job->irreversible) && + job_type_is_conflicting(j->unit->job->type, j->type)) + return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_IS_DESTRUCTIVE, + "Transaction for %s/%s is destructive (%s has '%s' job queued, but '%s' is included in transaction).", + tr->anchor_job->unit->id, job_type_to_string(tr->anchor_job->type), + j->unit->id, job_type_to_string(j->unit->job->type), job_type_to_string(j->type)); + } + + return 0; +} + +static void transaction_minimize_impact(Transaction *tr) { + Job *head; + + assert(tr); + + /* Drops all unnecessary jobs that reverse already active jobs + * or that stop a running service. */ + +rescan: + HASHMAP_FOREACH(head, tr->jobs) { + LIST_FOREACH(transaction, j, head) { + bool stops_running_service, changes_existing_job; + + /* If it matters, we shouldn't drop it */ + if (j->matters_to_anchor) + continue; + + /* Would this stop a running service? + * Would this change an existing job? + * If so, let's drop this entry */ + + stops_running_service = + j->type == JOB_STOP && UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(j->unit)); + + changes_existing_job = + j->unit->job && + job_type_is_conflicting(j->type, j->unit->job->type); + + if (!stops_running_service && !changes_existing_job) + continue; + + if (stops_running_service) + log_unit_debug(j->unit, + "%s/%s would stop a running service.", + j->unit->id, job_type_to_string(j->type)); + + if (changes_existing_job) + log_unit_debug(j->unit, + "%s/%s would change existing job.", + j->unit->id, job_type_to_string(j->type)); + + /* Ok, let's get rid of this */ + log_unit_debug(j->unit, + "Deleting %s/%s to minimize impact.", + j->unit->id, job_type_to_string(j->type)); + + transaction_delete_job(tr, j, true); + goto rescan; + } + } +} + +static int transaction_apply( + Transaction *tr, + Manager *m, + JobMode mode, + Set *affected_jobs) { + + Job *j; + int r; + + /* Moves the transaction jobs to the set of active jobs */ + + if (IN_SET(mode, JOB_ISOLATE, JOB_FLUSH)) { + + /* When isolating first kill all installed jobs which + * aren't part of the new transaction */ + HASHMAP_FOREACH(j, m->jobs) { + assert(j->installed); + + if (j->unit->ignore_on_isolate) + continue; + + if (hashmap_contains(tr->jobs, j->unit)) + continue; + + /* Not invalidating recursively. Avoids triggering + * OnFailure= actions of dependent jobs. Also avoids + * invalidating our iterator. */ + job_finish_and_invalidate(j, JOB_CANCELED, false, false); + } + } + + HASHMAP_FOREACH(j, tr->jobs) { + /* Assume merged */ + assert(!j->transaction_prev); + assert(!j->transaction_next); + + r = hashmap_ensure_put(&m->jobs, NULL, UINT32_TO_PTR(j->id), j); + if (r < 0) + goto rollback; + } + + while ((j = hashmap_steal_first(tr->jobs))) { + Job *installed_job; + + /* Clean the job dependencies */ + transaction_unlink_job(tr, j, false); + + /* When RestartMode=direct is used, the service being restarted don't enter the inactive/failed + * state, i.e. unit_process_job -> job_finish_and_invalidate is never called, and the previous + * job might still be running (especially for Type=oneshot services). We need to refuse + * late merge and re-enqueue the anchor job. */ + installed_job = job_install(j, + /* refuse_late_merge = */ mode == JOB_RESTART_DEPENDENCIES && j == tr->anchor_job); + if (installed_job != j) { + /* j has been merged into a previously installed job */ + if (tr->anchor_job == j) + tr->anchor_job = installed_job; + + hashmap_remove_value(m->jobs, UINT32_TO_PTR(j->id), j); + free_and_replace_full(j, installed_job, job_free); + } + + job_add_to_run_queue(j); + job_add_to_dbus_queue(j); + job_start_timer(j, false); + job_shutdown_magic(j); + + /* When 'affected' is specified, let's track all in it all jobs that were touched because of + * this transaction. */ + if (affected_jobs) + (void) set_put(affected_jobs, j); + } + + return 0; + +rollback: + + HASHMAP_FOREACH(j, tr->jobs) + hashmap_remove_value(m->jobs, UINT32_TO_PTR(j->id), j); + + return r; +} + +int transaction_activate( + Transaction *tr, + Manager *m, + JobMode mode, + Set *affected_jobs, + sd_bus_error *e) { + + Job *j; + int r; + unsigned generation = 1; + + assert(tr); + + /* This applies the changes recorded in tr->jobs to + * the actual list of jobs, if possible. */ + + /* Reset the generation counter of all installed jobs. The detection of cycles + * looks at installed jobs. If they had a non-zero generation from some previous + * walk of the graph, the algorithm would break. */ + HASHMAP_FOREACH(j, m->jobs) + j->generation = 0; + + /* First step: figure out which jobs matter */ + transaction_find_jobs_that_matter_to_anchor(tr->anchor_job, generation++); + + /* Second step: Try not to stop any running services if + * we don't have to. Don't try to reverse running + * jobs if we don't have to. */ + if (mode == JOB_FAIL) + transaction_minimize_impact(tr); + + /* Third step: Drop redundant jobs */ + transaction_drop_redundant(tr); + + for (;;) { + /* Fourth step: Let's remove unneeded jobs that might + * be lurking. */ + if (mode != JOB_ISOLATE) + transaction_collect_garbage(tr); + + /* Fifth step: verify order makes sense and correct + * cycles if necessary and possible */ + r = transaction_verify_order(tr, &generation, e); + if (r >= 0) + break; + + if (r != -EAGAIN) + return log_warning_errno(r, "Requested transaction contains an unfixable cyclic ordering dependency: %s", bus_error_message(e, r)); + + /* Let's see if the resulting transaction ordering + * graph is still cyclic... */ + } + + for (;;) { + /* Sixth step: let's drop unmergeable entries if + * necessary and possible, merge entries we can + * merge */ + r = transaction_merge_jobs(tr, e); + if (r >= 0) + break; + + if (r != -EAGAIN) + return log_warning_errno(r, "Requested transaction contains unmergeable jobs: %s", bus_error_message(e, r)); + + /* Seventh step: an entry got dropped, let's garbage + * collect its dependencies. */ + if (mode != JOB_ISOLATE) + transaction_collect_garbage(tr); + + /* Let's see if the resulting transaction still has + * unmergeable entries ... */ + } + + /* Eights step: Drop redundant jobs again, if the merging now allows us to drop more. */ + transaction_drop_redundant(tr); + + /* Ninth step: check whether we can actually apply this */ + r = transaction_is_destructive(tr, mode, e); + if (r < 0) + return log_notice_errno(r, "Requested transaction contradicts existing jobs: %s", bus_error_message(e, r)); + + /* Tenth step: apply changes */ + r = transaction_apply(tr, m, mode, affected_jobs); + if (r < 0) + return log_warning_errno(r, "Failed to apply transaction: %m"); + + assert(hashmap_isempty(tr->jobs)); + + /* Are there any jobs now? Then make sure we have the idle pipe around. We don't really care too much + * whether this works or not, as the idle pipe is a feature for cosmetics, not actually useful for + * anything beyond that. */ + if (!hashmap_isempty(m->jobs)) + (void) manager_allocate_idle_pipe(m); + + return 0; +} + +static Job* transaction_add_one_job(Transaction *tr, JobType type, Unit *unit, bool *is_new) { + Job *j, *f; + + assert(tr); + assert(unit); + + /* Looks for an existing prospective job and returns that. If + * it doesn't exist it is created and added to the prospective + * jobs list. */ + + f = hashmap_get(tr->jobs, unit); + + LIST_FOREACH(transaction, i, f) { + assert(i->unit == unit); + + if (i->type == type) { + if (is_new) + *is_new = false; + return i; + } + } + + j = job_new(unit, type); + if (!j) + return NULL; + + j->generation = 0; + j->marker = NULL; + j->matters_to_anchor = false; + j->irreversible = tr->irreversible; + + LIST_PREPEND(transaction, f, j); + + if (hashmap_replace(tr->jobs, unit, f) < 0) { + LIST_REMOVE(transaction, f, j); + job_free(j); + return NULL; + } + + if (is_new) + *is_new = true; + + log_trace("Added job %s/%s to transaction.", unit->id, job_type_to_string(type)); + + return j; +} + +static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies) { + assert(tr); + assert(j); + + if (j->transaction_prev) + j->transaction_prev->transaction_next = j->transaction_next; + else if (j->transaction_next) + hashmap_replace(tr->jobs, j->unit, j->transaction_next); + else + hashmap_remove_value(tr->jobs, j->unit, j); + + if (j->transaction_next) + j->transaction_next->transaction_prev = j->transaction_prev; + + j->transaction_prev = j->transaction_next = NULL; + + while (j->subject_list) + job_dependency_free(j->subject_list); + + while (j->object_list) { + Job *other = j->object_list->matters ? j->object_list->subject : NULL; + + job_dependency_free(j->object_list); + + if (other && delete_dependencies) { + log_unit_debug(other->unit, + "Deleting job %s/%s as dependency of job %s/%s", + other->unit->id, job_type_to_string(other->type), + j->unit->id, job_type_to_string(j->type)); + transaction_delete_job(tr, other, delete_dependencies); + } + } +} + +void transaction_add_propagate_reload_jobs( + Transaction *tr, + Unit *unit, + Job *by, + TransactionAddFlags flags) { + + JobType nt; + Unit *dep; + int r; + + assert(tr); + assert(unit); + + UNIT_FOREACH_DEPENDENCY(dep, unit, UNIT_ATOM_PROPAGATES_RELOAD_TO) { + _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + + nt = job_type_collapse(JOB_TRY_RELOAD, dep); + if (nt == JOB_NOP) + continue; + + r = transaction_add_job_and_dependencies(tr, nt, dep, by, flags, &e); + if (r < 0) + log_unit_warning(dep, + "Cannot add dependency reload job, ignoring: %s", + bus_error_message(&e, r)); + } +} + +static JobType job_type_propagate_stop_graceful(Job *j) { + JobType type; + + if (!j) + return JOB_STOP; + + type = JOB_STOP; + + LIST_FOREACH(transaction, i, j) + switch (i->type) { + + case JOB_STOP: + case JOB_RESTART: + /* Nothing to worry about, an appropriate job is in-place */ + return JOB_NOP; + + case JOB_START: + /* This unit is pulled in by other dependency types in this transaction. We will run + * into job type conflict if we enqueue a stop job, so let's enqueue a restart job + * instead. */ + type = JOB_RESTART; + break; + + default: /* We don't care about others */ + ; + + } + + return type; +} + +int transaction_add_job_and_dependencies( + Transaction *tr, + JobType type, + Unit *unit, + Job *by, + TransactionAddFlags flags, + sd_bus_error *e) { + + bool is_new; + Job *ret; + int r; + + assert(tr); + assert(type < _JOB_TYPE_MAX); + assert(type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(unit); + + /* Before adding jobs for this unit, let's ensure that its state has been loaded This matters when + * jobs are spawned as part of coldplugging itself (see e. g. path_coldplug()). This way, we + * "recursively" coldplug units, ensuring that we do not look at state of not-yet-coldplugged + * units. */ + if (MANAGER_IS_RELOADING(unit->manager)) + unit_coldplug(unit); + + if (by) + log_trace("Pulling in %s/%s from %s/%s", unit->id, job_type_to_string(type), by->unit->id, job_type_to_string(by->type)); + + /* Safety check that the unit is a valid state, i.e. not in UNIT_STUB or UNIT_MERGED which should only be set + * temporarily. */ + if (!UNIT_IS_LOAD_COMPLETE(unit->load_state)) + return sd_bus_error_setf(e, BUS_ERROR_LOAD_FAILED, "Unit %s is not loaded properly.", unit->id); + + if (type != JOB_STOP) { + r = bus_unit_validate_load_state(unit, e); + /* The time-based cache allows to start new units without daemon-reload, but if they are + * already referenced (because of dependencies or ordering) then we have to force a load of + * the fragment. As an optimization, check first if anything in the usual paths was modified + * since the last time the cache was loaded. Also check if the last time an attempt to load + * the unit was made was before the most recent cache refresh, so that we know we need to try + * again — even if the cache is current, it might have been updated in a different context + * before we had a chance to retry loading this particular unit. + * + * Given building up the transaction is a synchronous operation, attempt + * to load the unit immediately. */ + if (r < 0 && manager_unit_cache_should_retry_load(unit)) { + sd_bus_error_free(e); + unit->load_state = UNIT_STUB; + r = unit_load(unit); + if (r < 0 || unit->load_state == UNIT_STUB) + unit->load_state = UNIT_NOT_FOUND; + r = bus_unit_validate_load_state(unit, e); + } + if (r < 0) + return r; + } + + if (!unit_job_is_applicable(unit, type)) + return sd_bus_error_setf(e, BUS_ERROR_JOB_TYPE_NOT_APPLICABLE, + "Job type %s is not applicable for unit %s.", + job_type_to_string(type), unit->id); + + /* First add the job. */ + ret = transaction_add_one_job(tr, type, unit, &is_new); + if (!ret) + return -ENOMEM; + + if (FLAGS_SET(flags, TRANSACTION_IGNORE_ORDER)) + ret->ignore_order = true; + + /* Then, add a link to the job. */ + if (by) { + if (!job_dependency_new(by, ret, FLAGS_SET(flags, TRANSACTION_MATTERS), FLAGS_SET(flags, TRANSACTION_CONFLICTS))) + return -ENOMEM; + } else { + /* If the job has no parent job, it is the anchor job. */ + assert(!tr->anchor_job); + tr->anchor_job = ret; + } + + if (!is_new || FLAGS_SET(flags, TRANSACTION_IGNORE_REQUIREMENTS) || type == JOB_NOP) + return 0; + + _cleanup_set_free_ Set *following = NULL; + Unit *dep; + + /* If we are following some other unit, make sure we add all dependencies of everybody following. */ + if (unit_following_set(ret->unit, &following) > 0) + SET_FOREACH(dep, following) { + r = transaction_add_job_and_dependencies(tr, type, dep, ret, flags & TRANSACTION_IGNORE_ORDER, e); + if (r < 0) { + log_unit_full_errno(dep, r == -ERFKILL ? LOG_INFO : LOG_WARNING, r, + "Cannot add dependency job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } + + /* Finally, recursively add in all dependencies. */ + if (IN_SET(type, JOB_START, JOB_RESTART)) { + UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_START) { + r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_START_IGNORED) { + r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, flags & TRANSACTION_IGNORE_ORDER, e); + if (r < 0) { + /* unit masked, job type not applicable and unit not found are not considered + * as errors. */ + log_unit_full_errno(dep, + IN_SET(r, -ERFKILL, -EBADR, -ENOENT) ? LOG_DEBUG : LOG_WARNING, + r, "Cannot add dependency job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } + + UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_VERIFY) { + r = transaction_add_job_and_dependencies(tr, JOB_VERIFY_ACTIVE, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_STOP) { + r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, TRANSACTION_MATTERS | TRANSACTION_CONFLICTS | (flags & TRANSACTION_IGNORE_ORDER), e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_STOP_IGNORED) { + r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, flags & TRANSACTION_IGNORE_ORDER, e); + if (r < 0) { + log_unit_warning(dep, + "Cannot add dependency job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } + } + + if (IN_SET(type, JOB_RESTART, JOB_STOP) || (type == JOB_START && FLAGS_SET(flags, TRANSACTION_PROPAGATE_START_AS_RESTART))) { + bool is_stop = type == JOB_STOP; + + UNIT_FOREACH_DEPENDENCY(dep, ret->unit, is_stop ? UNIT_ATOM_PROPAGATE_STOP : UNIT_ATOM_PROPAGATE_RESTART) { + /* We propagate RESTART only as TRY_RESTART, in order not to start dependencies that + * are not around. */ + JobType nt; + + nt = job_type_collapse(is_stop ? JOB_STOP : JOB_TRY_RESTART, dep); + if (nt == JOB_NOP) + continue; + + r = transaction_add_job_and_dependencies(tr, nt, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + return r; + + sd_bus_error_free(e); + } + } + + /* Process UNIT_ATOM_PROPAGATE_STOP_GRACEFUL (PropagatesStopTo=) units. We need to wait until + * all other dependencies are processed, i.e. we're the anchor job or already in the recursion + * that handles it. */ + if (!by || FLAGS_SET(flags, TRANSACTION_PROCESS_PROPAGATE_STOP_GRACEFUL)) + UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PROPAGATE_STOP_GRACEFUL) { + JobType nt; + Job *j; + + j = hashmap_get(tr->jobs, dep); + nt = job_type_propagate_stop_graceful(j); + + if (nt == JOB_NOP) + continue; + + r = transaction_add_job_and_dependencies(tr, nt, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER) | TRANSACTION_PROCESS_PROPAGATE_STOP_GRACEFUL, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + return r; + + sd_bus_error_free(e); + } + } + } + + if (type == JOB_RELOAD) + transaction_add_propagate_reload_jobs(tr, ret->unit, ret, flags & TRANSACTION_IGNORE_ORDER); + + /* JOB_VERIFY_ACTIVE requires no dependency handling */ + + return 0; + +fail: + /* Recursive call failed to add required jobs so let's drop top level job as well. */ + log_unit_debug_errno(unit, r, "Cannot add dependency job to transaction, deleting job %s/%s again: %s", + unit->id, job_type_to_string(type), bus_error_message(e, r)); + + transaction_delete_job(tr, ret, /* delete_dependencies= */ false); + return r; +} + +static bool shall_stop_on_isolate(Transaction *tr, Unit *u) { + assert(tr); + assert(u); + + if (u->ignore_on_isolate) + return false; + + /* Is there already something listed for this? */ + if (hashmap_contains(tr->jobs, u)) + return false; + + return true; +} + +int transaction_add_isolate_jobs(Transaction *tr, Manager *m) { + Unit *u; + char *k; + int r; + + assert(tr); + assert(m); + + HASHMAP_FOREACH_KEY(u, k, m->units) { + _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + Unit *o; + + /* Ignore aliases */ + if (u->id != k) + continue; + + /* No need to stop inactive units */ + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) && !u->job) + continue; + + if (!shall_stop_on_isolate(tr, u)) + continue; + + /* Keep units that are triggered by units we want to keep around. */ + bool keep = false; + UNIT_FOREACH_DEPENDENCY(o, u, UNIT_ATOM_TRIGGERED_BY) + if (!shall_stop_on_isolate(tr, o)) { + keep = true; + break; + } + if (keep) + continue; + + r = transaction_add_job_and_dependencies(tr, JOB_STOP, u, tr->anchor_job, TRANSACTION_MATTERS, &e); + if (r < 0) + log_unit_warning_errno(u, r, "Cannot add isolate job, ignoring: %s", bus_error_message(&e, r)); + } + + return 0; +} + +int transaction_add_triggering_jobs(Transaction *tr, Unit *u) { + Unit *trigger; + int r; + + assert(tr); + assert(u); + + UNIT_FOREACH_DEPENDENCY(trigger, u, UNIT_ATOM_TRIGGERED_BY) { + _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + + /* No need to stop inactive jobs */ + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(trigger)) && !trigger->job) + continue; + + /* Is there already something listed for this? */ + if (hashmap_contains(tr->jobs, trigger)) + continue; + + r = transaction_add_job_and_dependencies(tr, JOB_STOP, trigger, tr->anchor_job, TRANSACTION_MATTERS, &e); + if (r < 0) + log_unit_warning_errno(u, r, "Cannot add triggered by job, ignoring: %s", bus_error_message(&e, r)); + } + + return 0; +} + +Transaction *transaction_new(bool irreversible) { + Transaction *tr; + + tr = new0(Transaction, 1); + if (!tr) + return NULL; + + tr->jobs = hashmap_new(NULL); + if (!tr->jobs) + return mfree(tr); + + tr->irreversible = irreversible; + + return tr; +} + +Transaction *transaction_free(Transaction *tr) { + if (!tr) + return NULL; + + assert(hashmap_isempty(tr->jobs)); + hashmap_free(tr->jobs); + + return mfree(tr); +} + +Transaction *transaction_abort_and_free(Transaction *tr) { + if (!tr) + return NULL; + + transaction_abort(tr); + + return transaction_free(tr); +} diff --git a/src/core/transaction.h b/src/core/transaction.h new file mode 100644 index 0000000..151e02d --- /dev/null +++ b/src/core/transaction.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Transaction Transaction; + +#include "hashmap.h" +#include "job.h" +#include "manager.h" +#include "unit.h" + +struct Transaction { + /* Jobs to be added */ + Hashmap *jobs; /* Unit object => Job object list 1:1 */ + Job *anchor_job; /* the job the user asked for */ + bool irreversible; +}; + +Transaction *transaction_new(bool irreversible); +Transaction *transaction_free(Transaction *tr); +Transaction *transaction_abort_and_free(Transaction *tr); +DEFINE_TRIVIAL_CLEANUP_FUNC(Transaction*, transaction_abort_and_free); + +typedef enum TransactionAddFlags { + TRANSACTION_MATTERS = 1 << 0, + TRANSACTION_CONFLICTS = 1 << 1, + TRANSACTION_IGNORE_REQUIREMENTS = 1 << 2, + TRANSACTION_IGNORE_ORDER = 1 << 3, + + /* Propagate a START job to other units like a RESTART */ + TRANSACTION_PROPAGATE_START_AS_RESTART = 1 << 4, + + /* Indicate that we're in the recursion for processing UNIT_ATOM_PROPAGATE_STOP_GRACEFUL units */ + TRANSACTION_PROCESS_PROPAGATE_STOP_GRACEFUL = 1 << 5, +} TransactionAddFlags; + +void transaction_add_propagate_reload_jobs( + Transaction *tr, + Unit *unit, Job *by, + TransactionAddFlags flags); + +int transaction_add_job_and_dependencies( + Transaction *tr, + JobType type, + Unit *unit, + Job *by, + TransactionAddFlags flags, + sd_bus_error *e); + +int transaction_activate(Transaction *tr, Manager *m, JobMode mode, Set *affected, sd_bus_error *e); +int transaction_add_isolate_jobs(Transaction *tr, Manager *m); +int transaction_add_triggering_jobs(Transaction *tr, Unit *u); diff --git a/src/core/unit-dependency-atom.c b/src/core/unit-dependency-atom.c new file mode 100644 index 0000000..35b279b --- /dev/null +++ b/src/core/unit-dependency-atom.c @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "unit-dependency-atom.h" + +static const UnitDependencyAtom atom_map[_UNIT_DEPENDENCY_MAX] = { + /* A table that maps high-level dependency types to low-level dependency "atoms". The latter actually + * describe specific facets of dependency behaviour. The former combine them into one user-facing + * concept. Atoms are a bit mask, though a bunch of dependency types have only a single bit set. + * + * Typically when the user configures a dependency they go via dependency type, but when we act on + * them we go by atom. + * + * NB: when you add a new dependency type here, make sure to also add one to the (best-effort) + * reverse table in unit_dependency_from_unique_atom() further down. */ + + [UNIT_REQUIRES] = UNIT_ATOM_PULL_IN_START | + UNIT_ATOM_RETROACTIVE_START_REPLACE | + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE | + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, + + [UNIT_REQUISITE] = UNIT_ATOM_PULL_IN_VERIFY | + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE | + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, + + [UNIT_WANTS] = UNIT_ATOM_PULL_IN_START_IGNORED | + UNIT_ATOM_RETROACTIVE_START_FAIL | + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE | + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, + + [UNIT_BINDS_TO] = UNIT_ATOM_PULL_IN_START | + UNIT_ATOM_RETROACTIVE_START_REPLACE | + UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT | + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE | + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, + + [UNIT_PART_OF] = UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, + + [UNIT_UPHOLDS] = UNIT_ATOM_PULL_IN_START_IGNORED | + UNIT_ATOM_RETROACTIVE_START_REPLACE | + UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE | + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE | + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, + + [UNIT_REQUIRED_BY] = UNIT_ATOM_PROPAGATE_STOP | + UNIT_ATOM_PROPAGATE_RESTART | + UNIT_ATOM_PROPAGATE_START_FAILURE | + UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED | + UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES, + + [UNIT_REQUISITE_OF] = UNIT_ATOM_PROPAGATE_STOP | + UNIT_ATOM_PROPAGATE_RESTART | + UNIT_ATOM_PROPAGATE_START_FAILURE | + UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE | + UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED | + UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES, + + [UNIT_WANTED_BY] = UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES | + UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED, + + [UNIT_BOUND_BY] = UNIT_ATOM_RETROACTIVE_STOP_ON_STOP | + UNIT_ATOM_PROPAGATE_STOP | + UNIT_ATOM_PROPAGATE_RESTART | + UNIT_ATOM_PROPAGATE_START_FAILURE | + UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED | + UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE | + UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES, + + [UNIT_UPHELD_BY] = UNIT_ATOM_START_STEADILY | + UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES | + UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED, + + [UNIT_CONSISTS_OF] = UNIT_ATOM_PROPAGATE_STOP | + UNIT_ATOM_PROPAGATE_RESTART, + + [UNIT_CONFLICTS] = UNIT_ATOM_PULL_IN_STOP | + UNIT_ATOM_RETROACTIVE_STOP_ON_START, + + [UNIT_CONFLICTED_BY] = UNIT_ATOM_PULL_IN_STOP_IGNORED | + UNIT_ATOM_RETROACTIVE_STOP_ON_START | + UNIT_ATOM_PROPAGATE_STOP_FAILURE, + + [UNIT_PROPAGATES_STOP_TO] = UNIT_ATOM_RETROACTIVE_STOP_ON_STOP | + UNIT_ATOM_PROPAGATE_STOP_GRACEFUL, + + /* These are simple dependency types: they consist of a single atom only */ + [UNIT_ON_FAILURE] = UNIT_ATOM_ON_FAILURE, + [UNIT_ON_SUCCESS] = UNIT_ATOM_ON_SUCCESS, + [UNIT_ON_FAILURE_OF] = UNIT_ATOM_ON_FAILURE_OF, + [UNIT_ON_SUCCESS_OF] = UNIT_ATOM_ON_SUCCESS_OF, + [UNIT_BEFORE] = UNIT_ATOM_BEFORE, + [UNIT_AFTER] = UNIT_ATOM_AFTER, + [UNIT_TRIGGERS] = UNIT_ATOM_TRIGGERS, + [UNIT_TRIGGERED_BY] = UNIT_ATOM_TRIGGERED_BY, + [UNIT_PROPAGATES_RELOAD_TO] = UNIT_ATOM_PROPAGATES_RELOAD_TO, + [UNIT_JOINS_NAMESPACE_OF] = UNIT_ATOM_JOINS_NAMESPACE_OF, + [UNIT_REFERENCES] = UNIT_ATOM_REFERENCES, + [UNIT_REFERENCED_BY] = UNIT_ATOM_REFERENCED_BY, + [UNIT_IN_SLICE] = UNIT_ATOM_IN_SLICE, + [UNIT_SLICE_OF] = UNIT_ATOM_SLICE_OF, + + /* These are dependency types without effect on our state engine. We maintain them only to make + * things discoverable/debuggable as they are the inverse dependencies to some of the above. As they + * have no effect of their own, they all map to no atoms at all, i.e. the value 0. */ + [UNIT_RELOAD_PROPAGATED_FROM] = 0, + [UNIT_STOP_PROPAGATED_FROM] = 0, +}; + +UnitDependencyAtom unit_dependency_to_atom(UnitDependency d) { + if (d < 0) + return _UNIT_DEPENDENCY_ATOM_INVALID; + + assert(d < _UNIT_DEPENDENCY_MAX); + + return atom_map[d]; +} + +UnitDependency unit_dependency_from_unique_atom(UnitDependencyAtom atom) { + + /* This is a "best-effort" function that maps the specified 'atom' mask to a dependency type that is + * is equal to or has a superset of bits set if that's uniquely possible. The idea is that this + * function is used when iterating through deps that have a specific atom: if there's exactly one + * dependency type of the specific atom we don't need iterate through all deps a unit has, but can + * pinpoint things directly. + * + * This function will return _UNIT_DEPENDENCY_INVALID in case the specified value is not known or not + * uniquely defined, i.e. there are multiple dependencies with the atom or the combination set. */ + + switch ((int64_t) atom) { + + /* Note that we can't list UNIT_REQUIRES here since it's a true subset of UNIT_BINDS_TO, and + * hence its atom bits not uniquely mappable. */ + + case UNIT_ATOM_PULL_IN_VERIFY | + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE | + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE: + case UNIT_ATOM_PULL_IN_VERIFY: /* a single dep type uses this atom */ + return UNIT_REQUISITE; + + case UNIT_ATOM_PULL_IN_START_IGNORED | + UNIT_ATOM_RETROACTIVE_START_FAIL | + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE | + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE: + case UNIT_ATOM_RETROACTIVE_START_FAIL: + return UNIT_WANTS; + + case UNIT_ATOM_PULL_IN_START | + UNIT_ATOM_RETROACTIVE_START_REPLACE | + UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT | + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE | + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE: + case UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT: + return UNIT_BINDS_TO; + + case UNIT_ATOM_PULL_IN_START_IGNORED | + UNIT_ATOM_RETROACTIVE_START_REPLACE | + UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE | + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE | + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE: + case UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE: + return UNIT_UPHOLDS; + + case UNIT_ATOM_PROPAGATE_STOP | + UNIT_ATOM_PROPAGATE_RESTART | + UNIT_ATOM_PROPAGATE_START_FAILURE | + UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE | + UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED | + UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES: + case UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE: + return UNIT_REQUISITE_OF; + + case UNIT_ATOM_RETROACTIVE_STOP_ON_STOP | + UNIT_ATOM_PROPAGATE_STOP | + UNIT_ATOM_PROPAGATE_RESTART | + UNIT_ATOM_PROPAGATE_START_FAILURE | + UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED | + UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE | + UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES: + case UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE: + return UNIT_BOUND_BY; + + case UNIT_ATOM_START_STEADILY | + UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES | + UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED: + case UNIT_ATOM_START_STEADILY: + return UNIT_UPHELD_BY; + + case UNIT_ATOM_PULL_IN_STOP | + UNIT_ATOM_RETROACTIVE_STOP_ON_START: + case UNIT_ATOM_PULL_IN_STOP: + return UNIT_CONFLICTS; + + case UNIT_ATOM_PULL_IN_STOP_IGNORED | + UNIT_ATOM_RETROACTIVE_STOP_ON_START | + UNIT_ATOM_PROPAGATE_STOP_FAILURE: + case UNIT_ATOM_PULL_IN_STOP_IGNORED: + case UNIT_ATOM_PROPAGATE_STOP_FAILURE: + return UNIT_CONFLICTED_BY; + + case UNIT_ATOM_RETROACTIVE_STOP_ON_STOP | + UNIT_ATOM_PROPAGATE_STOP_GRACEFUL: + case UNIT_ATOM_PROPAGATE_STOP_GRACEFUL: + return UNIT_PROPAGATES_STOP_TO; + + /* And now, the simple ones */ + + case UNIT_ATOM_ON_FAILURE: + return UNIT_ON_FAILURE; + + case UNIT_ATOM_ON_SUCCESS: + return UNIT_ON_SUCCESS; + + case UNIT_ATOM_ON_SUCCESS_OF: + return UNIT_ON_SUCCESS_OF; + + case UNIT_ATOM_ON_FAILURE_OF: + return UNIT_ON_FAILURE_OF; + + case UNIT_ATOM_BEFORE: + return UNIT_BEFORE; + + case UNIT_ATOM_AFTER: + return UNIT_AFTER; + + case UNIT_ATOM_TRIGGERS: + return UNIT_TRIGGERS; + + case UNIT_ATOM_TRIGGERED_BY: + return UNIT_TRIGGERED_BY; + + case UNIT_ATOM_PROPAGATES_RELOAD_TO: + return UNIT_PROPAGATES_RELOAD_TO; + + case UNIT_ATOM_JOINS_NAMESPACE_OF: + return UNIT_JOINS_NAMESPACE_OF; + + case UNIT_ATOM_REFERENCES: + return UNIT_REFERENCES; + + case UNIT_ATOM_REFERENCED_BY: + return UNIT_REFERENCED_BY; + + case UNIT_ATOM_IN_SLICE: + return UNIT_IN_SLICE; + + case UNIT_ATOM_SLICE_OF: + return UNIT_SLICE_OF; + + default: + return _UNIT_DEPENDENCY_INVALID; + } +} diff --git a/src/core/unit-dependency-atom.h b/src/core/unit-dependency-atom.h new file mode 100644 index 0000000..96f00ca --- /dev/null +++ b/src/core/unit-dependency-atom.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "unit-def.h" + +/* Flags that identify the various "atomic" behaviours a specific dependency type implies. Each dependency is + * a combination of one or more of these flags that define what they actually entail. */ +typedef enum UnitDependencyAtom { + + /* This unit pulls in the other unit as JOB_START job into the transaction, and if that doesn't work + * the transaction fails. */ + UNIT_ATOM_PULL_IN_START = UINT64_C(1) << 0, + /* Similar, but if it doesn't work, ignore. */ + UNIT_ATOM_PULL_IN_START_IGNORED = UINT64_C(1) << 1, + /* Pull in a JOB_VERIFY job into the transaction, i.e. pull in JOB_VERIFY rather than + * JOB_START. i.e. check the unit is started but don't pull it in. */ + UNIT_ATOM_PULL_IN_VERIFY = UINT64_C(1) << 2, + + /* Pull in a JOB_STOP job for the other job into transactions, and fail if that doesn't work. */ + UNIT_ATOM_PULL_IN_STOP = UINT64_C(1) << 3, + /* Same, but don't fail, ignore it. */ + UNIT_ATOM_PULL_IN_STOP_IGNORED = UINT64_C(1) << 4, + + /* If our enters inactive state, add the other unit to the StopWhenUneeded= queue */ + UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE = UINT64_C(1) << 5, + /* Pin the other unit i.e. ensure StopWhenUneeded= won't trigger for the other unit as long as we are + * not in inactive state */ + UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED = UINT64_C(1) << 6, + + /* Stop our unit if the other unit happens to inactive */ + UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT = UINT64_C(1) << 7, + /* If our unit enters inactive state, add the other unit to the BoundBy= queue */ + UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE = UINT64_C(1) << 8, + + /* Start this unit whenever we find it inactive and the other unit active */ + UNIT_ATOM_START_STEADILY = UINT64_C(1) << 9, + /* Whenever our unit becomes active, add other unit to start_when_upheld_queue */ + UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE = UINT64_C(1) << 10, + + /* If our unit unexpectedly becomes active, retroactively start the other unit too, in "replace" job + * mode */ + UNIT_ATOM_RETROACTIVE_START_REPLACE = UINT64_C(1) << 11, + /* Similar, but in "fail" job mode */ + UNIT_ATOM_RETROACTIVE_START_FAIL = UINT64_C(1) << 12, + /* If our unit unexpectedly becomes active, retroactively stop the other unit too */ + UNIT_ATOM_RETROACTIVE_STOP_ON_START = UINT64_C(1) << 13, + /* If our unit unexpectedly becomes inactive, retroactively stop the other unit too */ + UNIT_ATOM_RETROACTIVE_STOP_ON_STOP = UINT64_C(1) << 14, + + /* If a start job for this unit fails, propagate the failure to start job of other unit too */ + UNIT_ATOM_PROPAGATE_START_FAILURE = UINT64_C(1) << 15, + /* If a stop job for this unit fails, propagate the failure to any stop job of the other unit too */ + UNIT_ATOM_PROPAGATE_STOP_FAILURE = UINT64_C(1) << 16, + /* If our start job succeeded but the unit is inactive then (think: oneshot units), propagate this as + * failure to the other unit. */ + UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE = UINT64_C(1) << 17, + /* When putting together a transaction, propagate JOB_STOP from our unit to the other. */ + UNIT_ATOM_PROPAGATE_STOP = UINT64_C(1) << 18, + /* Like UNIT_ATOM_PROPAGATE_STOP, but enqueues a restart job if there's already a start job (avoids + * job type conflict). */ + UNIT_ATOM_PROPAGATE_STOP_GRACEFUL = UINT64_C(1) << 19, + /* When putting together a transaction, propagate JOB_RESTART from our unit to the other. */ + UNIT_ATOM_PROPAGATE_RESTART = UINT64_C(1) << 20, + + /* Add the other unit to the default target dependency queue */ + UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE = UINT64_C(1) << 21, + /* Recheck default target deps on other units (which are target units) */ + UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES = UINT64_C(1) << 22, + + /* The remaining atoms map 1:1 to the equally named high-level deps */ + UNIT_ATOM_ON_FAILURE = UINT64_C(1) << 23, + UNIT_ATOM_ON_SUCCESS = UINT64_C(1) << 24, + UNIT_ATOM_ON_FAILURE_OF = UINT64_C(1) << 25, + UNIT_ATOM_ON_SUCCESS_OF = UINT64_C(1) << 26, + UNIT_ATOM_BEFORE = UINT64_C(1) << 27, + UNIT_ATOM_AFTER = UINT64_C(1) << 28, + UNIT_ATOM_TRIGGERS = UINT64_C(1) << 29, + UNIT_ATOM_TRIGGERED_BY = UINT64_C(1) << 30, + UNIT_ATOM_PROPAGATES_RELOAD_TO = UINT64_C(1) << 31, + UNIT_ATOM_JOINS_NAMESPACE_OF = UINT64_C(1) << 32, + UNIT_ATOM_REFERENCES = UINT64_C(1) << 33, + UNIT_ATOM_REFERENCED_BY = UINT64_C(1) << 34, + UNIT_ATOM_IN_SLICE = UINT64_C(1) << 35, + UNIT_ATOM_SLICE_OF = UINT64_C(1) << 36, + _UNIT_DEPENDENCY_ATOM_MAX = (UINT64_C(1) << 37) - 1, + _UNIT_DEPENDENCY_ATOM_INVALID = -EINVAL, +} UnitDependencyAtom; + +UnitDependencyAtom unit_dependency_to_atom(UnitDependency d); +UnitDependency unit_dependency_from_unique_atom(UnitDependencyAtom atom); diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c new file mode 100644 index 0000000..9f95984 --- /dev/null +++ b/src/core/unit-printf.c @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "cgroup-util.h" +#include "format-util.h" +#include "macro.h" +#include "specifier.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit-printf.h" +#include "unit.h" +#include "user-util.h" + +static int specifier_prefix_and_instance(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + + return unit_name_to_prefix_and_instance(u->id, ret); +} + +static int specifier_prefix(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + + return unit_name_to_prefix(u->id, ret); +} + +static int specifier_prefix_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + _cleanup_free_ char *p = NULL; + const Unit *u = ASSERT_PTR(userdata); + int r; + + r = unit_name_to_prefix(u->id, &p); + if (r < 0) + return r; + + return unit_name_unescape(p, ret); +} + +static int specifier_instance_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + + return unit_name_unescape(strempty(u->instance), ret); +} + +static int specifier_last_component(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + _cleanup_free_ char *prefix = NULL; + char *dash; + int r; + + r = unit_name_to_prefix(u->id, &prefix); + if (r < 0) + return r; + + dash = strrchr(prefix, '-'); + if (dash) + return specifier_string(specifier, dash + 1, root, userdata, ret); + + *ret = TAKE_PTR(prefix); + return 0; +} + +static int specifier_last_component_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + _cleanup_free_ char *p = NULL; + int r; + + r = specifier_last_component(specifier, data, root, userdata, &p); + if (r < 0) + return r; + + return unit_name_unescape(p, ret); +} + +static int specifier_filename(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + + if (u->instance) + return unit_name_path_unescape(u->instance, ret); + else + return unit_name_to_path(u->id, ret); +} + +static void bad_specifier(const Unit *u, char specifier) { + log_unit_warning(u, "Specifier '%%%c' used in unit configuration, which is deprecated. Please update your unit file, as it does not work as intended.", specifier); +} + +static int specifier_cgroup(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + + bad_specifier(u, specifier); + + if (u->cgroup_path) { + char *n; + + n = strdup(u->cgroup_path); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; + } + + return unit_default_cgroup_path(u, ret); +} + +static int specifier_cgroup_root(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + char *n; + + bad_specifier(u, specifier); + + n = strdup(u->manager->cgroup_root); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +static int specifier_cgroup_slice(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata), *slice; + char *n; + + bad_specifier(u, specifier); + + slice = UNIT_GET_SLICE(u); + if (slice) { + if (slice->cgroup_path) + n = strdup(slice->cgroup_path); + else + return unit_default_cgroup_path(slice, ret); + } else + n = strdup(u->manager->cgroup_root); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +static int specifier_special_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + char *n; + + n = strdup(u->manager->prefix[PTR_TO_UINT(data)]); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +static int specifier_credentials_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + char *d; + + assert(ret); + + d = strjoin(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id); + if (!d) + return -ENOMEM; + + *ret = d; + return 0; +} + +int unit_name_printf(const Unit *u, const char* format, char **ret) { + /* + * This will use the passed string as format string and replace the following specifiers (which should all be + * safe for inclusion in unit names): + * + * %n: the full id of the unit (foo-aaa@bar.waldo) + * %N: the id of the unit without the suffix (foo-aaa@bar) + * %p: the prefix (foo-aaa) + * %i: the instance (bar) + * %j: the last component of the prefix (aaa) + */ + + const Specifier table[] = { + { 'i', specifier_string, u->instance }, + { 'j', specifier_last_component, NULL }, + { 'n', specifier_string, u->id }, + { 'N', specifier_prefix_and_instance, NULL }, + { 'p', specifier_prefix, NULL }, + + COMMON_SYSTEM_SPECIFIERS, + + COMMON_CREDS_SPECIFIERS(u->manager->runtime_scope), + {} + }; + + assert(u); + assert(format); + assert(ret); + + return specifier_printf(format, UNIT_NAME_MAX, table, NULL, u, ret); +} + +int unit_full_printf_full(const Unit *u, const char *format, size_t max_length, char **ret) { + /* This is similar to unit_name_printf() but also supports unescaping. Also, adds a couple of + * additional codes (which are likely not suitable for unescaped inclusion in unit names): + * + * %f: the unescaped instance if set, otherwise the id unescaped as path + * + * %c: cgroup path of unit (deprecated) + * %r: where units in this slice are placed in the cgroup tree (deprecated) + * %R: the root of this systemd's instance tree (deprecated) + * + * %C: the cache directory root (e.g. /var/cache or $XDG_CACHE_HOME) + * %d: the credentials directory ($CREDENTIALS_DIRECTORY) + * %E: the configuration directory root (e.g. /etc or $XDG_CONFIG_HOME) + * %L: the log directory root (e.g. /var/log or $XDG_STATE_HOME/log) + * %S: the state directory root (e.g. /var/lib or $XDG_STATE_HOME) + * %t: the runtime directory root (e.g. /run or $XDG_RUNTIME_DIR) + * + * %h: the homedir of the running user + * %s: the shell of the running user + * + * NOTICE: When you add new entries here, please be careful: specifiers which depend on settings of + * the unit file itself are broken by design, as they would resolve differently depending on whether + * they are used before or after the relevant configuration setting. Hence: don't add them. + */ + + assert(u); + assert(format); + assert(ret); + + const Specifier table[] = { + { 'i', specifier_string, u->instance }, + { 'I', specifier_instance_unescaped, NULL }, + { 'j', specifier_last_component, NULL }, + { 'J', specifier_last_component_unescaped, NULL }, + { 'n', specifier_string, u->id }, + { 'N', specifier_prefix_and_instance, NULL }, + { 'p', specifier_prefix, NULL }, + { 'P', specifier_prefix_unescaped, NULL }, + + { 'f', specifier_filename, NULL }, + { 'y', specifier_real_path, u->fragment_path }, + { 'Y', specifier_real_directory, u->fragment_path }, + + { 'c', specifier_cgroup, NULL }, /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */ + { 'r', specifier_cgroup_slice, NULL }, /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */ + { 'R', specifier_cgroup_root, NULL }, /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */ + + { 'C', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CACHE) }, + { 'd', specifier_credentials_dir, NULL }, + { 'E', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CONFIGURATION) }, + { 'L', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_LOGS) }, + { 'S', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_STATE) }, + { 't', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_RUNTIME) }, + + { 'h', specifier_user_home, NULL }, + { 's', specifier_user_shell, NULL }, + + COMMON_SYSTEM_SPECIFIERS, + + COMMON_CREDS_SPECIFIERS(u->manager->runtime_scope), + + COMMON_TMP_SPECIFIERS, + {} + }; + + return specifier_printf(format, max_length, table, NULL, u, ret); +} diff --git a/src/core/unit-printf.h b/src/core/unit-printf.h new file mode 100644 index 0000000..2df07db --- /dev/null +++ b/src/core/unit-printf.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "creds-util.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "unit.h" + +int unit_name_printf(const Unit *u, const char* text, char **ret); +int unit_full_printf_full(const Unit *u, const char *text, size_t max_length, char **ret); +static inline int unit_full_printf(const Unit *u, const char *text, char **ret) { + return unit_full_printf_full(u, text, LONG_LINE_MAX, ret); +} +static inline int unit_path_printf(const Unit *u, const char *text, char **ret) { + return unit_full_printf_full(u, text, PATH_MAX-1, ret); +} +static inline int unit_fd_printf(const Unit *u, const char *text, char **ret) { + return unit_full_printf_full(u, text, FDNAME_MAX, ret); +} +static inline int unit_cred_printf(const Unit *u, const char *text, char **ret) { + return unit_full_printf_full(u, text, CREDENTIAL_NAME_MAX, ret); +} +static inline int unit_env_printf(const Unit *u, const char *text, char **ret) { + return unit_full_printf_full(u, text, sc_arg_max(), ret); +} diff --git a/src/core/unit-serialize.c b/src/core/unit-serialize.c new file mode 100644 index 0000000..fe4221c --- /dev/null +++ b/src/core/unit-serialize.c @@ -0,0 +1,890 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bpf-socket-bind.h" +#include "bus-util.h" +#include "dbus.h" +#include "fileio-label.h" +#include "fileio.h" +#include "format-util.h" +#include "parse-util.h" +#include "restrict-ifaces.h" +#include "serialize.h" +#include "string-table.h" +#include "unit-serialize.h" +#include "user-util.h" + +static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) { + _cleanup_free_ char *s = NULL; + int r; + + assert(f); + assert(key); + + if (mask == 0) + return 0; + + r = cg_mask_to_string(mask, &s); + if (r < 0) + return log_error_errno(r, "Failed to format cgroup mask: %m"); + + return serialize_item(f, key, s); +} + +/* Make sure out values fit in the bitfield. */ +assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8); + +static int serialize_markers(FILE *f, unsigned markers) { + assert(f); + + if (markers == 0) + return 0; + + fputs("markers=", f); + for (UnitMarker m = 0; m < _UNIT_MARKER_MAX; m++) + if (FLAGS_SET(markers, 1u << m)) + fputs(unit_marker_to_string(m), f); + fputc('\n', f); + return 0; +} + +static int deserialize_markers(Unit *u, const char *value) { + assert(u); + assert(value); + int r; + + for (const char *p = value;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r <= 0) + return r; + + UnitMarker m = unit_marker_from_string(word); + if (m < 0) { + log_unit_debug_errno(u, m, "Unknown unit marker \"%s\", ignoring.", word); + continue; + } + + u->markers |= 1u << m; + } +} + +static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes", + [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets", + [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes", + [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric); + +static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-base", + [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-base", + [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-base", + [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric); + +static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-last", + [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-last", + [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-last", + [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric); + +static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = { + [CGROUP_MEMORY_PEAK] = "memory-accounting-peak", + [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric); + +int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root) { + int r; + + assert(u); + assert(f); + assert(fds); + + if (switching_root && UNIT_VTABLE(u)->exclude_from_switch_root_serialization) { + /* In the new root, paths for mounts and automounts will be different, so it doesn't make + * much sense to serialize things. API file systems will be moved to the new root, but we + * don't have mount units for those. */ + log_unit_debug(u, "not serializing before switch-root"); + return 0; + } + + /* Start marker */ + fputs(u->id, f); + fputc('\n', f); + + assert(!!UNIT_VTABLE(u)->serialize == !!UNIT_VTABLE(u)->deserialize_item); + + if (UNIT_VTABLE(u)->serialize) { + r = UNIT_VTABLE(u)->serialize(u, f, fds); + if (r < 0) + return r; + } + + (void) serialize_dual_timestamp(f, "state-change-timestamp", &u->state_change_timestamp); + + (void) serialize_dual_timestamp(f, "inactive-exit-timestamp", &u->inactive_exit_timestamp); + (void) serialize_dual_timestamp(f, "active-enter-timestamp", &u->active_enter_timestamp); + (void) serialize_dual_timestamp(f, "active-exit-timestamp", &u->active_exit_timestamp); + (void) serialize_dual_timestamp(f, "inactive-enter-timestamp", &u->inactive_enter_timestamp); + + (void) serialize_dual_timestamp(f, "condition-timestamp", &u->condition_timestamp); + (void) serialize_dual_timestamp(f, "assert-timestamp", &u->assert_timestamp); + + (void) serialize_ratelimit(f, "start-ratelimit", &u->start_ratelimit); + (void) serialize_ratelimit(f, "auto-start-stop-ratelimit", &u->auto_start_stop_ratelimit); + + if (dual_timestamp_is_set(&u->condition_timestamp)) + (void) serialize_bool(f, "condition-result", u->condition_result); + + if (dual_timestamp_is_set(&u->assert_timestamp)) + (void) serialize_bool(f, "assert-result", u->assert_result); + + (void) serialize_bool(f, "transient", u->transient); + (void) serialize_bool(f, "in-audit", u->in_audit); + + (void) serialize_bool(f, "exported-invocation-id", u->exported_invocation_id); + (void) serialize_bool(f, "exported-log-level-max", u->exported_log_level_max); + (void) serialize_bool(f, "exported-log-extra-fields", u->exported_log_extra_fields); + (void) serialize_bool(f, "exported-log-rate-limit-interval", u->exported_log_ratelimit_interval); + (void) serialize_bool(f, "exported-log-rate-limit-burst", u->exported_log_ratelimit_burst); + + (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, u->cpu_usage_base); + if (u->cpu_usage_last != NSEC_INFINITY) + (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last); + + if (u->managed_oom_kill_last > 0) + (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, u->managed_oom_kill_last); + + if (u->oom_kill_last > 0) + (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last); + + for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) { + (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, u->io_accounting_base[im]); + + if (u->io_accounting_last[im] != UINT64_MAX) + (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, u->io_accounting_last[im]); + } + + for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) { + uint64_t v; + + r = unit_get_memory_accounting(u, metric, &v); + if (r >= 0) + (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v); + } + + if (u->cgroup_path) + (void) serialize_item(f, "cgroup", u->cgroup_path); + + (void) serialize_bool(f, "cgroup-realized", u->cgroup_realized); + (void) serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask); + (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask); + (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", u->cgroup_invalidated_mask); + + (void) bpf_serialize_socket_bind(u, f, fds); + + (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", u->ip_bpf_ingress_installed); + (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", u->ip_bpf_egress_installed); + (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", u->bpf_device_control_installed); + (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", u->ip_bpf_custom_ingress_installed); + (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", u->ip_bpf_custom_egress_installed); + + (void) serialize_restrict_network_interfaces(u, f, fds); + + if (uid_is_valid(u->ref_uid)) + (void) serialize_item_format(f, "ref-uid", UID_FMT, u->ref_uid); + if (gid_is_valid(u->ref_gid)) + (void) serialize_item_format(f, "ref-gid", GID_FMT, u->ref_gid); + + if (!sd_id128_is_null(u->invocation_id)) + (void) serialize_item_format(f, "invocation-id", SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)); + + (void) serialize_item_format(f, "freezer-state", "%s", freezer_state_to_string(unit_freezer_state(u))); + (void) serialize_markers(f, u->markers); + + bus_track_serialize(u->bus_track, f, "ref"); + + for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { + uint64_t v; + + r = unit_get_ip_accounting(u, m, &v); + if (r >= 0) + (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v); + } + + if (!switching_root) { + if (u->job) { + fputs("job\n", f); + job_serialize(u->job, f); + } + + if (u->nop_job) { + fputs("job\n", f); + job_serialize(u->nop_job, f); + } + } + + /* End marker */ + fputc('\n', f); + return 0; +} + +static int unit_deserialize_job(Unit *u, FILE *f) { + _cleanup_(job_freep) Job *j = NULL; + int r; + + assert(u); + assert(f); + + j = job_new_raw(u); + if (!j) + return log_oom(); + + r = job_deserialize(j, f); + if (r < 0) + return r; + + r = job_install_deserialized(j); + if (r < 0) + return r; + + TAKE_PTR(j); + return 0; +} + +#define MATCH_DESERIALIZE(key, l, v, parse_func, target) \ + ({ \ + bool _deserialize_matched = streq(l, key); \ + if (_deserialize_matched) { \ + int _deserialize_r = parse_func(v); \ + if (_deserialize_r < 0) \ + log_unit_debug_errno(u, _deserialize_r, \ + "Failed to parse \"%s=%s\", ignoring.", l, v); \ + else \ + target = _deserialize_r; \ + }; \ + _deserialize_matched; \ + }) + +#define MATCH_DESERIALIZE_IMMEDIATE(key, l, v, parse_func, target) \ + ({ \ + bool _deserialize_matched = streq(l, key); \ + if (_deserialize_matched) { \ + int _deserialize_r = parse_func(v, &target); \ + if (_deserialize_r < 0) \ + log_unit_debug_errno(u, _deserialize_r, \ + "Failed to parse \"%s=%s\", ignoring", l, v); \ + }; \ + _deserialize_matched; \ + }) + +int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) { + int r; + + assert(u); + assert(f); + assert(fds); + + for (;;) { + _cleanup_free_ char *l = NULL; + ssize_t m; + size_t k; + char *v; + + r = deserialize_read_line(f, &l); + if (r < 0) + return r; + if (r == 0) /* eof or end marker */ + break; + + k = strcspn(l, "="); + + if (l[k] == '=') { + l[k] = 0; + v = l+k+1; + } else + v = l+k; + + if (streq(l, "job")) { + if (v[0] == '\0') { + /* New-style serialized job */ + r = unit_deserialize_job(u, f); + if (r < 0) + return r; + } else /* Legacy for pre-44 */ + log_unit_warning(u, "Update from too old systemd versions are unsupported, cannot deserialize job: %s", v); + continue; + } else if (streq(l, "state-change-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->state_change_timestamp); + continue; + } else if (streq(l, "inactive-exit-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->inactive_exit_timestamp); + continue; + } else if (streq(l, "active-enter-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->active_enter_timestamp); + continue; + } else if (streq(l, "active-exit-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->active_exit_timestamp); + continue; + } else if (streq(l, "inactive-enter-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->inactive_enter_timestamp); + continue; + } else if (streq(l, "condition-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->condition_timestamp); + continue; + } else if (streq(l, "assert-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->assert_timestamp); + continue; + + } else if (streq(l, "start-ratelimit")) { + deserialize_ratelimit(&u->start_ratelimit, l, v); + continue; + } else if (streq(l, "auto-start-stop-ratelimit")) { + deserialize_ratelimit(&u->auto_start_stop_ratelimit, l, v); + continue; + + } else if (MATCH_DESERIALIZE("condition-result", l, v, parse_boolean, u->condition_result)) + continue; + + else if (MATCH_DESERIALIZE("assert-result", l, v, parse_boolean, u->assert_result)) + continue; + + else if (MATCH_DESERIALIZE("transient", l, v, parse_boolean, u->transient)) + continue; + + else if (MATCH_DESERIALIZE("in-audit", l, v, parse_boolean, u->in_audit)) + continue; + + else if (MATCH_DESERIALIZE("exported-invocation-id", l, v, parse_boolean, u->exported_invocation_id)) + continue; + + else if (MATCH_DESERIALIZE("exported-log-level-max", l, v, parse_boolean, u->exported_log_level_max)) + continue; + + else if (MATCH_DESERIALIZE("exported-log-extra-fields", l, v, parse_boolean, u->exported_log_extra_fields)) + continue; + + else if (MATCH_DESERIALIZE("exported-log-rate-limit-interval", l, v, parse_boolean, u->exported_log_ratelimit_interval)) + continue; + + else if (MATCH_DESERIALIZE("exported-log-rate-limit-burst", l, v, parse_boolean, u->exported_log_ratelimit_burst)) + continue; + + else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-base", l, v, safe_atou64, u->cpu_usage_base) || + MATCH_DESERIALIZE_IMMEDIATE("cpuacct-usage-base", l, v, safe_atou64, u->cpu_usage_base)) + continue; + + else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-last", l, v, safe_atou64, u->cpu_usage_last)) + continue; + + else if (MATCH_DESERIALIZE_IMMEDIATE("managed-oom-kill-last", l, v, safe_atou64, u->managed_oom_kill_last)) + continue; + + else if (MATCH_DESERIALIZE_IMMEDIATE("oom-kill-last", l, v, safe_atou64, u->oom_kill_last)) + continue; + + else if (streq(l, "cgroup")) { + r = unit_set_cgroup_path(u, v); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v); + + (void) unit_watch_cgroup(u); + (void) unit_watch_cgroup_memory(u); + + continue; + + } else if (MATCH_DESERIALIZE("cgroup-realized", l, v, parse_boolean, u->cgroup_realized)) + continue; + + else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-realized-mask", l, v, cg_mask_from_string, u->cgroup_realized_mask)) + continue; + + else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-enabled-mask", l, v, cg_mask_from_string, u->cgroup_enabled_mask)) + continue; + + else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-invalidated-mask", l, v, cg_mask_from_string, u->cgroup_invalidated_mask)) + continue; + + else if (STR_IN_SET(l, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) { + int fd; + + fd = deserialize_fd(fds, v); + if (fd >= 0) + (void) bpf_socket_bind_add_initial_link_fd(u, fd); + continue; + + } else if (streq(l, "ip-bpf-ingress-installed")) { + (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_ingress_installed); + continue; + } else if (streq(l, "ip-bpf-egress-installed")) { + (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_egress_installed); + continue; + } else if (streq(l, "bpf-device-control-installed")) { + (void) bpf_program_deserialize_attachment(v, fds, &u->bpf_device_control_installed); + continue; + + } else if (streq(l, "ip-bpf-custom-ingress-installed")) { + (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_ingress_installed); + continue; + } else if (streq(l, "ip-bpf-custom-egress-installed")) { + (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_egress_installed); + continue; + + } else if (streq(l, "restrict-ifaces-bpf-fd")) { + int fd; + + fd = deserialize_fd(fds, v); + if (fd >= 0) + (void) restrict_network_interfaces_add_initial_link_fd(u, fd); + + continue; + + } else if (streq(l, "ref-uid")) { + uid_t uid; + + r = parse_uid(v, &uid); + if (r < 0) + log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v); + else + unit_ref_uid_gid(u, uid, GID_INVALID); + continue; + + } else if (streq(l, "ref-gid")) { + gid_t gid; + + r = parse_gid(v, &gid); + if (r < 0) + log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v); + else + unit_ref_uid_gid(u, UID_INVALID, gid); + continue; + + } else if (streq(l, "ref")) { + r = strv_extend(&u->deserialized_refs, v); + if (r < 0) + return log_oom(); + continue; + + } else if (streq(l, "invocation-id")) { + sd_id128_t id; + + r = sd_id128_from_string(v, &id); + if (r < 0) + log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v); + else { + r = unit_set_invocation_id(u, id); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to set invocation ID for unit: %m"); + } + + continue; + + } else if (MATCH_DESERIALIZE("freezer-state", l, v, freezer_state_from_string, u->freezer_state)) + continue; + + else if (streq(l, "markers")) { + r = deserialize_markers(u, v); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to deserialize \"%s=%s\", ignoring: %m", l, v); + continue; + } + + m = memory_accounting_metric_field_last_from_string(l); + if (m >= 0) { + uint64_t c; + + r = safe_atou64(v, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", v); + else + u->memory_accounting_last[m] = c; + continue; + } + + /* Check if this is an IP accounting metric serialization field */ + m = ip_accounting_metric_field_from_string(l); + if (m >= 0) { + uint64_t c; + + r = safe_atou64(v, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", v); + else + u->ip_accounting_extra[m] = c; + continue; + } + + m = io_accounting_metric_field_base_from_string(l); + if (m >= 0) { + uint64_t c; + + r = safe_atou64(v, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", v); + else + u->io_accounting_base[m] = c; + continue; + } + + m = io_accounting_metric_field_last_from_string(l); + if (m >= 0) { + uint64_t c; + + r = safe_atou64(v, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", v); + else + u->io_accounting_last[m] = c; + continue; + } + + r = exec_shared_runtime_deserialize_compat(u, l, v, fds); + if (r < 0) { + log_unit_warning(u, "Failed to deserialize runtime parameter '%s', ignoring.", l); + continue; + } else if (r > 0) + /* Returns positive if key was handled by the call */ + continue; + + if (UNIT_VTABLE(u)->deserialize_item) { + r = UNIT_VTABLE(u)->deserialize_item(u, l, v, fds); + if (r < 0) + log_unit_warning(u, "Failed to deserialize unit parameter '%s', ignoring.", l); + } + } + + /* Versions before 228 did not carry a state change timestamp. In this case, take the current + * time. This is useful, so that timeouts based on this timestamp don't trigger too early, and is + * in-line with the logic from before 228 where the base for timeouts was not persistent across + * reboots. */ + + if (!dual_timestamp_is_set(&u->state_change_timestamp)) + dual_timestamp_now(&u->state_change_timestamp); + + /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings + * applied after we are done. For that we invalidate anything already realized, so that we can + * realize it again. */ + if (u->cgroup_realized) { + unit_invalidate_cgroup(u, _CGROUP_MASK_ALL); + unit_invalidate_cgroup_bpf(u); + } + + return 0; +} + +int unit_deserialize_state_skip(FILE *f) { + int r; + + assert(f); + + /* Skip serialized data for this unit. We don't know what it is. */ + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + return 0; + + /* End marker */ + if (isempty(line)) + return 1; + } +} + +static void print_unit_dependency_mask(FILE *f, const char *kind, UnitDependencyMask mask, bool *space) { + const struct { + UnitDependencyMask mask; + const char *name; + } table[] = { + { UNIT_DEPENDENCY_FILE, "file" }, + { UNIT_DEPENDENCY_IMPLICIT, "implicit" }, + { UNIT_DEPENDENCY_DEFAULT, "default" }, + { UNIT_DEPENDENCY_UDEV, "udev" }, + { UNIT_DEPENDENCY_PATH, "path" }, + { UNIT_DEPENDENCY_MOUNT_FILE, "mount-file" }, + { UNIT_DEPENDENCY_MOUNTINFO, "mountinfo" }, + { UNIT_DEPENDENCY_PROC_SWAP, "proc-swap" }, + { UNIT_DEPENDENCY_SLICE_PROPERTY, "slice-property" }, + }; + + assert(f); + assert(kind); + assert(space); + + for (size_t i = 0; i < ELEMENTSOF(table); i++) { + + if (mask == 0) + break; + + if (FLAGS_SET(mask, table[i].mask)) { + if (*space) + fputc(' ', f); + else + *space = true; + + fputs(kind, f); + fputs("-", f); + fputs(table[i].name, f); + + mask &= ~table[i].mask; + } + } + + assert(mask == 0); +} + +void unit_dump(Unit *u, FILE *f, const char *prefix) { + char *t; + const char *prefix2; + Unit *following; + _cleanup_set_free_ Set *following_set = NULL; + CGroupMask m; + int r; + + assert(u); + assert(u->type >= 0); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + fprintf(f, + "%s-> Unit %s:\n", + prefix, u->id); + + SET_FOREACH(t, u->aliases) + fprintf(f, "%s\tAlias: %s\n", prefix, t); + + fprintf(f, + "%s\tDescription: %s\n" + "%s\tInstance: %s\n" + "%s\tUnit Load State: %s\n" + "%s\tUnit Active State: %s\n" + "%s\tState Change Timestamp: %s\n" + "%s\tInactive Exit Timestamp: %s\n" + "%s\tActive Enter Timestamp: %s\n" + "%s\tActive Exit Timestamp: %s\n" + "%s\tInactive Enter Timestamp: %s\n" + "%s\tMay GC: %s\n" + "%s\tNeed Daemon Reload: %s\n" + "%s\tTransient: %s\n" + "%s\tPerpetual: %s\n" + "%s\tGarbage Collection Mode: %s\n", + prefix, unit_description(u), + prefix, strna(u->instance), + prefix, unit_load_state_to_string(u->load_state), + prefix, unit_active_state_to_string(unit_active_state(u)), + prefix, strna(FORMAT_TIMESTAMP(u->state_change_timestamp.realtime)), + prefix, strna(FORMAT_TIMESTAMP(u->inactive_exit_timestamp.realtime)), + prefix, strna(FORMAT_TIMESTAMP(u->active_enter_timestamp.realtime)), + prefix, strna(FORMAT_TIMESTAMP(u->active_exit_timestamp.realtime)), + prefix, strna(FORMAT_TIMESTAMP(u->inactive_enter_timestamp.realtime)), + prefix, yes_no(unit_may_gc(u)), + prefix, yes_no(unit_need_daemon_reload(u)), + prefix, yes_no(u->transient), + prefix, yes_no(u->perpetual), + prefix, collect_mode_to_string(u->collect_mode)); + + if (u->markers != 0) { + fprintf(f, "%s\tMarkers:", prefix); + + for (UnitMarker marker = 0; marker < _UNIT_MARKER_MAX; marker++) + if (FLAGS_SET(u->markers, 1u << marker)) + fprintf(f, " %s", unit_marker_to_string(marker)); + fputs("\n", f); + } + + if (UNIT_HAS_CGROUP_CONTEXT(u)) { + fprintf(f, + "%s\tSlice: %s\n" + "%s\tCGroup: %s\n" + "%s\tCGroup realized: %s\n", + prefix, strna(unit_slice_name(u)), + prefix, strna(u->cgroup_path), + prefix, yes_no(u->cgroup_realized)); + + if (u->cgroup_realized_mask != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(u->cgroup_realized_mask, &s); + fprintf(f, "%s\tCGroup realized mask: %s\n", prefix, strnull(s)); + } + + if (u->cgroup_enabled_mask != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(u->cgroup_enabled_mask, &s); + fprintf(f, "%s\tCGroup enabled mask: %s\n", prefix, strnull(s)); + } + + m = unit_get_own_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup own mask: %s\n", prefix, strnull(s)); + } + + m = unit_get_members_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup members mask: %s\n", prefix, strnull(s)); + } + + m = unit_get_delegate_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup delegate mask: %s\n", prefix, strnull(s)); + } + } + + if (!sd_id128_is_null(u->invocation_id)) + fprintf(f, "%s\tInvocation ID: " SD_ID128_FORMAT_STR "\n", + prefix, SD_ID128_FORMAT_VAL(u->invocation_id)); + + STRV_FOREACH(j, u->documentation) + fprintf(f, "%s\tDocumentation: %s\n", prefix, *j); + + if (u->access_selinux_context) + fprintf(f, "%s\tAccess SELinux Context: %s\n", prefix, u->access_selinux_context); + + following = unit_following(u); + if (following) + fprintf(f, "%s\tFollowing: %s\n", prefix, following->id); + + r = unit_following_set(u, &following_set); + if (r >= 0) { + Unit *other; + + SET_FOREACH(other, following_set) + fprintf(f, "%s\tFollowing Set Member: %s\n", prefix, other->id); + } + + if (u->fragment_path) + fprintf(f, "%s\tFragment Path: %s\n", prefix, u->fragment_path); + + if (u->source_path) + fprintf(f, "%s\tSource Path: %s\n", prefix, u->source_path); + + STRV_FOREACH(j, u->dropin_paths) + fprintf(f, "%s\tDropIn Path: %s\n", prefix, *j); + + if (u->failure_action != EMERGENCY_ACTION_NONE) + fprintf(f, "%s\tFailure Action: %s\n", prefix, emergency_action_to_string(u->failure_action)); + if (u->failure_action_exit_status >= 0) + fprintf(f, "%s\tFailure Action Exit Status: %i\n", prefix, u->failure_action_exit_status); + if (u->success_action != EMERGENCY_ACTION_NONE) + fprintf(f, "%s\tSuccess Action: %s\n", prefix, emergency_action_to_string(u->success_action)); + if (u->success_action_exit_status >= 0) + fprintf(f, "%s\tSuccess Action Exit Status: %i\n", prefix, u->success_action_exit_status); + + if (u->job_timeout != USEC_INFINITY) + fprintf(f, "%s\tJob Timeout: %s\n", prefix, FORMAT_TIMESPAN(u->job_timeout, 0)); + + if (u->job_timeout_action != EMERGENCY_ACTION_NONE) + fprintf(f, "%s\tJob Timeout Action: %s\n", prefix, emergency_action_to_string(u->job_timeout_action)); + + if (u->job_timeout_reboot_arg) + fprintf(f, "%s\tJob Timeout Reboot Argument: %s\n", prefix, u->job_timeout_reboot_arg); + + condition_dump_list(u->conditions, f, prefix, condition_type_to_string); + condition_dump_list(u->asserts, f, prefix, assert_type_to_string); + + if (dual_timestamp_is_set(&u->condition_timestamp)) + fprintf(f, + "%s\tCondition Timestamp: %s\n" + "%s\tCondition Result: %s\n", + prefix, strna(FORMAT_TIMESTAMP(u->condition_timestamp.realtime)), + prefix, yes_no(u->condition_result)); + + if (dual_timestamp_is_set(&u->assert_timestamp)) + fprintf(f, + "%s\tAssert Timestamp: %s\n" + "%s\tAssert Result: %s\n", + prefix, strna(FORMAT_TIMESTAMP(u->assert_timestamp.realtime)), + prefix, yes_no(u->assert_result)); + + for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) { + UnitDependencyInfo di; + Unit *other; + + HASHMAP_FOREACH_KEY(di.data, other, unit_get_dependencies(u, d)) { + bool space = false; + + fprintf(f, "%s\t%s: %s (", prefix, unit_dependency_to_string(d), other->id); + + print_unit_dependency_mask(f, "origin", di.origin_mask, &space); + print_unit_dependency_mask(f, "destination", di.destination_mask, &space); + + fputs(")\n", f); + } + } + + if (!hashmap_isempty(u->requires_mounts_for)) { + UnitDependencyInfo di; + const char *path; + + HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) { + bool space = false; + + fprintf(f, "%s\tRequiresMountsFor: %s (", prefix, path); + + print_unit_dependency_mask(f, "origin", di.origin_mask, &space); + print_unit_dependency_mask(f, "destination", di.destination_mask, &space); + + fputs(")\n", f); + } + } + + if (u->load_state == UNIT_LOADED) { + + fprintf(f, + "%s\tStopWhenUnneeded: %s\n" + "%s\tRefuseManualStart: %s\n" + "%s\tRefuseManualStop: %s\n" + "%s\tDefaultDependencies: %s\n" + "%s\tSurviveFinalKillSignal: %s\n" + "%s\tOnSuccessJobMode: %s\n" + "%s\tOnFailureJobMode: %s\n" + "%s\tIgnoreOnIsolate: %s\n", + prefix, yes_no(u->stop_when_unneeded), + prefix, yes_no(u->refuse_manual_start), + prefix, yes_no(u->refuse_manual_stop), + prefix, yes_no(u->default_dependencies), + prefix, yes_no(u->survive_final_kill_signal), + prefix, job_mode_to_string(u->on_success_job_mode), + prefix, job_mode_to_string(u->on_failure_job_mode), + prefix, yes_no(u->ignore_on_isolate)); + + if (UNIT_VTABLE(u)->dump) + UNIT_VTABLE(u)->dump(u, f, prefix2); + + } else if (u->load_state == UNIT_MERGED) + fprintf(f, + "%s\tMerged into: %s\n", + prefix, u->merged_into->id); + else if (u->load_state == UNIT_ERROR) { + errno = abs(u->load_error); + fprintf(f, "%s\tLoad Error Code: %m\n", prefix); + } + + for (const char *n = sd_bus_track_first(u->bus_track); n; n = sd_bus_track_next(u->bus_track)) + fprintf(f, "%s\tBus Ref: %s\n", prefix, n); + + if (u->job) + job_dump(u->job, f, prefix2); + + if (u->nop_job) + job_dump(u->nop_job, f, prefix2); +} diff --git a/src/core/unit-serialize.h b/src/core/unit-serialize.h new file mode 100644 index 0000000..ab8a8e3 --- /dev/null +++ b/src/core/unit-serialize.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "unit.h" +#include "fdset.h" + +/* These functions serialize state for our own usage, i.e.: across a reload/reexec, rather than for being + * passed to a child process. */ + +int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs); +int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds); +int unit_deserialize_state_skip(FILE *f); + +void unit_dump(Unit *u, FILE *f, const char *prefix); diff --git a/src/core/unit.c b/src/core/unit.c new file mode 100644 index 0000000..2fc9f5a --- /dev/null +++ b/src/core/unit.c @@ -0,0 +1,6617 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-id128.h" +#include "sd-messages.h" + +#include "all-units.h" +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bpf-foreign.h" +#include "bpf-socket-bind.h" +#include "bus-common-errors.h" +#include "bus-internal.h" +#include "bus-util.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "chase.h" +#include "core-varlink.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "dropin.h" +#include "env-util.h" +#include "escape.h" +#include "exec-credential.h" +#include "execute.h" +#include "fd-util.h" +#include "fileio-label.h" +#include "fileio.h" +#include "format-util.h" +#include "id128-util.h" +#include "install.h" +#include "iovec-util.h" +#include "label-util.h" +#include "load-dropin.h" +#include "load-fragment.h" +#include "log.h" +#include "logarithm.h" +#include "macro.h" +#include "mkdir-label.h" +#include "path-util.h" +#include "process-util.h" +#include "rm-rf.h" +#include "serialize.h" +#include "set.h" +#include "signal-util.h" +#include "sparse-endian.h" +#include "special.h" +#include "specifier.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "unit-name.h" +#include "unit.h" +#include "user-util.h" +#include "virt.h" +#if BPF_FRAMEWORK +#include "bpf-link.h" +#endif + +/* Thresholds for logging at INFO level about resource consumption */ +#define MENTIONWORTHY_CPU_NSEC (1 * NSEC_PER_SEC) +#define MENTIONWORTHY_IO_BYTES (1024 * 1024ULL) +#define MENTIONWORTHY_IP_BYTES (0ULL) + +/* Thresholds for logging at INFO level about resource consumption */ +#define NOTICEWORTHY_CPU_NSEC (10*60 * NSEC_PER_SEC) /* 10 minutes */ +#define NOTICEWORTHY_IO_BYTES (10 * 1024 * 1024ULL) /* 10 MB */ +#define NOTICEWORTHY_IP_BYTES (128 * 1024 * 1024ULL) /* 128 MB */ + +const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX] = { + [UNIT_SERVICE] = &service_vtable, + [UNIT_SOCKET] = &socket_vtable, + [UNIT_TARGET] = &target_vtable, + [UNIT_DEVICE] = &device_vtable, + [UNIT_MOUNT] = &mount_vtable, + [UNIT_AUTOMOUNT] = &automount_vtable, + [UNIT_SWAP] = &swap_vtable, + [UNIT_TIMER] = &timer_vtable, + [UNIT_PATH] = &path_vtable, + [UNIT_SLICE] = &slice_vtable, + [UNIT_SCOPE] = &scope_vtable, +}; + +Unit* unit_new(Manager *m, size_t size) { + Unit *u; + + assert(m); + assert(size >= sizeof(Unit)); + + u = malloc0(size); + if (!u) + return NULL; + + u->manager = m; + u->type = _UNIT_TYPE_INVALID; + u->default_dependencies = true; + u->unit_file_state = _UNIT_FILE_STATE_INVALID; + u->unit_file_preset = -1; + u->on_failure_job_mode = JOB_REPLACE; + u->on_success_job_mode = JOB_FAIL; + u->cgroup_control_inotify_wd = -1; + u->cgroup_memory_inotify_wd = -1; + u->job_timeout = USEC_INFINITY; + u->job_running_timeout = USEC_INFINITY; + u->ref_uid = UID_INVALID; + u->ref_gid = GID_INVALID; + u->cpu_usage_last = NSEC_INFINITY; + + unit_reset_memory_accounting_last(u); + + unit_reset_io_accounting_last(u); + + u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL; + u->failure_action_exit_status = u->success_action_exit_status = -1; + + u->ip_accounting_ingress_map_fd = -EBADF; + u->ip_accounting_egress_map_fd = -EBADF; + + u->ipv4_allow_map_fd = -EBADF; + u->ipv6_allow_map_fd = -EBADF; + u->ipv4_deny_map_fd = -EBADF; + u->ipv6_deny_map_fd = -EBADF; + + u->last_section_private = -1; + + u->start_ratelimit = (const RateLimit) { + m->defaults.start_limit_interval, + m->defaults.start_limit_burst, + }; + + u->auto_start_stop_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_SEC, .burst = 16 }; + + return u; +} + +int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret) { + _cleanup_(unit_freep) Unit *u = NULL; + int r; + + u = unit_new(m, size); + if (!u) + return -ENOMEM; + + r = unit_add_name(u, name); + if (r < 0) + return r; + + *ret = TAKE_PTR(u); + + return r; +} + +bool unit_has_name(const Unit *u, const char *name) { + assert(u); + assert(name); + + return streq_ptr(name, u->id) || + set_contains(u->aliases, name); +} + +static void unit_init(Unit *u) { + CGroupContext *cc; + ExecContext *ec; + KillContext *kc; + + assert(u); + assert(u->manager); + assert(u->type >= 0); + + cc = unit_get_cgroup_context(u); + if (cc) { + cgroup_context_init(cc); + + /* Copy in the manager defaults into the cgroup + * context, _before_ the rest of the settings have + * been initialized */ + + cc->cpu_accounting = u->manager->defaults.cpu_accounting; + cc->io_accounting = u->manager->defaults.io_accounting; + cc->blockio_accounting = u->manager->defaults.blockio_accounting; + cc->memory_accounting = u->manager->defaults.memory_accounting; + cc->tasks_accounting = u->manager->defaults.tasks_accounting; + cc->ip_accounting = u->manager->defaults.ip_accounting; + + if (u->type != UNIT_SLICE) + cc->tasks_max = u->manager->defaults.tasks_max; + + cc->memory_pressure_watch = u->manager->defaults.memory_pressure_watch; + cc->memory_pressure_threshold_usec = u->manager->defaults.memory_pressure_threshold_usec; + } + + ec = unit_get_exec_context(u); + if (ec) { + exec_context_init(ec); + + if (u->manager->defaults.oom_score_adjust_set) { + ec->oom_score_adjust = u->manager->defaults.oom_score_adjust; + ec->oom_score_adjust_set = true; + } + + if (MANAGER_IS_SYSTEM(u->manager)) + ec->keyring_mode = EXEC_KEYRING_SHARED; + else { + ec->keyring_mode = EXEC_KEYRING_INHERIT; + + /* User manager might have its umask redefined by PAM or UMask=. In this + * case let the units it manages inherit this value by default. They can + * still tune this value through their own unit file */ + (void) get_process_umask(0, &ec->umask); + } + } + + kc = unit_get_kill_context(u); + if (kc) + kill_context_init(kc); + + if (UNIT_VTABLE(u)->init) + UNIT_VTABLE(u)->init(u); +} + +static int unit_add_alias(Unit *u, char *donated_name) { + int r; + + /* Make sure that u->names is allocated. We may leave u->names + * empty if we fail later, but this is not a problem. */ + r = set_ensure_put(&u->aliases, &string_hash_ops, donated_name); + if (r < 0) + return r; + assert(r > 0); + + return 0; +} + +int unit_add_name(Unit *u, const char *text) { + _cleanup_free_ char *name = NULL, *instance = NULL; + UnitType t; + int r; + + assert(u); + assert(text); + + if (unit_name_is_valid(text, UNIT_NAME_TEMPLATE)) { + if (!u->instance) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "instance is not set when adding name '%s': %m", text); + + r = unit_name_replace_instance(text, u->instance, &name); + if (r < 0) + return log_unit_debug_errno(u, r, + "failed to build instance name from '%s': %m", text); + } else { + name = strdup(text); + if (!name) + return -ENOMEM; + } + + if (unit_has_name(u, name)) + return 0; + + if (hashmap_contains(u->manager->units, name)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST), + "unit already exist when adding name '%s': %m", name); + + if (!unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "name '%s' is invalid: %m", name); + + t = unit_name_to_type(name); + if (t < 0) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "failed to derive unit type from name '%s': %m", name); + + if (u->type != _UNIT_TYPE_INVALID && t != u->type) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "unit type is illegal: u->type(%d) and t(%d) for name '%s': %m", + u->type, t, name); + + r = unit_name_to_instance(name, &instance); + if (r < 0) + return log_unit_debug_errno(u, r, "failed to extract instance from name '%s': %m", name); + + if (instance && !unit_type_may_template(t)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "templates are not allowed for name '%s': %m", name); + + /* Ensure that this unit either has no instance, or that the instance matches. */ + if (u->type != _UNIT_TYPE_INVALID && !streq_ptr(u->instance, instance)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "cannot add name %s, the instances don't match (\"%s\" != \"%s\").", + name, instance, u->instance); + + if (u->id && !unit_type_may_alias(t)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST), + "cannot add name %s, aliases are not allowed for %s units.", + name, unit_type_to_string(t)); + + if (hashmap_size(u->manager->units) >= MANAGER_MAX_NAMES) + return log_unit_warning_errno(u, SYNTHETIC_ERRNO(E2BIG), "cannot add name, manager has too many units: %m"); + + /* Add name to the global hashmap first, because that's easier to undo */ + r = hashmap_put(u->manager->units, name, u); + if (r < 0) + return log_unit_debug_errno(u, r, "add unit to hashmap failed for name '%s': %m", text); + + if (u->id) { + r = unit_add_alias(u, name); /* unit_add_alias() takes ownership of the name on success */ + if (r < 0) { + hashmap_remove(u->manager->units, name); + return r; + } + TAKE_PTR(name); + + } else { + /* A new name, we don't need the set yet. */ + assert(u->type == _UNIT_TYPE_INVALID); + assert(!u->instance); + + u->type = t; + u->id = TAKE_PTR(name); + u->instance = TAKE_PTR(instance); + + LIST_PREPEND(units_by_type, u->manager->units_by_type[t], u); + unit_init(u); + } + + unit_add_to_dbus_queue(u); + return 0; +} + +int unit_choose_id(Unit *u, const char *name) { + _cleanup_free_ char *t = NULL; + char *s; + int r; + + assert(u); + assert(name); + + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + if (!u->instance) + return -EINVAL; + + r = unit_name_replace_instance(name, u->instance, &t); + if (r < 0) + return r; + + name = t; + } + + if (streq_ptr(u->id, name)) + return 0; /* Nothing to do. */ + + /* Selects one of the aliases of this unit as the id */ + s = set_get(u->aliases, (char*) name); + if (!s) + return -ENOENT; + + if (u->id) { + r = set_remove_and_put(u->aliases, name, u->id); + if (r < 0) + return r; + } else + assert_se(set_remove(u->aliases, name)); /* see set_get() above… */ + + u->id = s; /* Old u->id is now stored in the set, and s is not stored anywhere */ + unit_add_to_dbus_queue(u); + + return 0; +} + +int unit_set_description(Unit *u, const char *description) { + int r; + + assert(u); + + r = free_and_strdup(&u->description, empty_to_null(description)); + if (r < 0) + return r; + if (r > 0) + unit_add_to_dbus_queue(u); + + return 0; +} + +static bool unit_success_failure_handler_has_jobs(Unit *unit) { + Unit *other; + + UNIT_FOREACH_DEPENDENCY(other, unit, UNIT_ATOM_ON_SUCCESS) + if (other->job || other->nop_job) + return true; + + UNIT_FOREACH_DEPENDENCY(other, unit, UNIT_ATOM_ON_FAILURE) + if (other->job || other->nop_job) + return true; + + return false; +} + +void unit_release_resources(Unit *u) { + UnitActiveState state; + ExecContext *ec; + + assert(u); + + if (u->job || u->nop_job) + return; + + if (u->perpetual) + return; + + state = unit_active_state(u); + if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED)) + return; + + if (unit_will_restart(u)) + return; + + ec = unit_get_exec_context(u); + if (ec && ec->runtime_directory_preserve_mode == EXEC_PRESERVE_RESTART) + exec_context_destroy_runtime_directory(ec, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + + if (UNIT_VTABLE(u)->release_resources) + UNIT_VTABLE(u)->release_resources(u); +} + +bool unit_may_gc(Unit *u) { + UnitActiveState state; + int r; + + assert(u); + + /* Checks whether the unit is ready to be unloaded for garbage collection. Returns true when the + * unit may be collected, and false if there's some reason to keep it loaded. + * + * References from other units are *not* checked here. Instead, this is done in unit_gc_sweep(), but + * using markers to properly collect dependency loops. + */ + + if (u->job || u->nop_job) + return false; + + if (u->perpetual) + return false; + + /* if we saw a cgroup empty event for this unit, stay around until we processed it so that we remove + * the empty cgroup if possible. Similar, process any pending OOM events if they are already queued + * before we release the unit. */ + if (u->in_cgroup_empty_queue || u->in_cgroup_oom_queue) + return false; + + /* Make sure to send out D-Bus events before we unload the unit */ + if (u->in_dbus_queue) + return false; + + if (sd_bus_track_count(u->bus_track) > 0) + return false; + + state = unit_active_state(u); + + /* But we keep the unit object around for longer when it is referenced or configured to not be + * gc'ed */ + switch (u->collect_mode) { + + case COLLECT_INACTIVE: + if (state != UNIT_INACTIVE) + return false; + + break; + + case COLLECT_INACTIVE_OR_FAILED: + if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED)) + return false; + + break; + + default: + assert_not_reached(); + } + + /* Check if any OnFailure= or on Success= jobs may be pending */ + if (unit_success_failure_handler_has_jobs(u)) + return false; + + if (u->cgroup_path) { + /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay + * around. Units with active processes should never be collected. */ + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path)); + if (r <= 0) + return false; + } + + if (!UNIT_VTABLE(u)->may_gc) + return true; + + return UNIT_VTABLE(u)->may_gc(u); +} + +void unit_add_to_load_queue(Unit *u) { + assert(u); + assert(u->type != _UNIT_TYPE_INVALID); + + if (u->load_state != UNIT_STUB || u->in_load_queue) + return; + + LIST_PREPEND(load_queue, u->manager->load_queue, u); + u->in_load_queue = true; +} + +void unit_add_to_cleanup_queue(Unit *u) { + assert(u); + + if (u->in_cleanup_queue) + return; + + LIST_PREPEND(cleanup_queue, u->manager->cleanup_queue, u); + u->in_cleanup_queue = true; +} + +void unit_add_to_gc_queue(Unit *u) { + assert(u); + + if (u->in_gc_queue || u->in_cleanup_queue) + return; + + if (!unit_may_gc(u)) + return; + + LIST_PREPEND(gc_queue, u->manager->gc_unit_queue, u); + u->in_gc_queue = true; +} + +void unit_add_to_dbus_queue(Unit *u) { + assert(u); + assert(u->type != _UNIT_TYPE_INVALID); + + if (u->load_state == UNIT_STUB || u->in_dbus_queue) + return; + + /* Shortcut things if nobody cares */ + if (sd_bus_track_count(u->manager->subscribed) <= 0 && + sd_bus_track_count(u->bus_track) <= 0 && + set_isempty(u->manager->private_buses)) { + u->sent_dbus_new_signal = true; + return; + } + + LIST_PREPEND(dbus_queue, u->manager->dbus_unit_queue, u); + u->in_dbus_queue = true; +} + +void unit_submit_to_stop_when_unneeded_queue(Unit *u) { + assert(u); + + if (u->in_stop_when_unneeded_queue) + return; + + if (!u->stop_when_unneeded) + return; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return; + + LIST_PREPEND(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u); + u->in_stop_when_unneeded_queue = true; +} + +void unit_submit_to_start_when_upheld_queue(Unit *u) { + assert(u); + + if (u->in_start_when_upheld_queue) + return; + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + return; + + if (!unit_has_dependency(u, UNIT_ATOM_START_STEADILY, NULL)) + return; + + LIST_PREPEND(start_when_upheld_queue, u->manager->start_when_upheld_queue, u); + u->in_start_when_upheld_queue = true; +} + +void unit_submit_to_stop_when_bound_queue(Unit *u) { + assert(u); + + if (u->in_stop_when_bound_queue) + return; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return; + + if (!unit_has_dependency(u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT, NULL)) + return; + + LIST_PREPEND(stop_when_bound_queue, u->manager->stop_when_bound_queue, u); + u->in_stop_when_bound_queue = true; +} + +static bool unit_can_release_resources(Unit *u) { + ExecContext *ec; + + assert(u); + + if (UNIT_VTABLE(u)->release_resources) + return true; + + ec = unit_get_exec_context(u); + if (ec && ec->runtime_directory_preserve_mode == EXEC_PRESERVE_RESTART) + return true; + + return false; +} + +void unit_submit_to_release_resources_queue(Unit *u) { + assert(u); + + if (u->in_release_resources_queue) + return; + + if (u->job || u->nop_job) + return; + + if (u->perpetual) + return; + + if (!unit_can_release_resources(u)) + return; + + LIST_PREPEND(release_resources_queue, u->manager->release_resources_queue, u); + u->in_release_resources_queue = true; +} + +static void unit_clear_dependencies(Unit *u) { + assert(u); + + /* Removes all dependencies configured on u and their reverse dependencies. */ + + for (Hashmap *deps; (deps = hashmap_steal_first(u->dependencies));) { + + for (Unit *other; (other = hashmap_steal_first_key(deps));) { + Hashmap *other_deps; + + HASHMAP_FOREACH(other_deps, other->dependencies) + hashmap_remove(other_deps, u); + + unit_add_to_gc_queue(other); + } + + hashmap_free(deps); + } + + u->dependencies = hashmap_free(u->dependencies); +} + +static void unit_remove_transient(Unit *u) { + assert(u); + + if (!u->transient) + return; + + if (u->fragment_path) + (void) unlink(u->fragment_path); + + STRV_FOREACH(i, u->dropin_paths) { + _cleanup_free_ char *p = NULL, *pp = NULL; + + if (path_extract_directory(*i, &p) < 0) /* Get the drop-in directory from the drop-in file */ + continue; + + if (path_extract_directory(p, &pp) < 0) /* Get the config directory from the drop-in directory */ + continue; + + /* Only drop transient drop-ins */ + if (!path_equal(u->manager->lookup_paths.transient, pp)) + continue; + + (void) unlink(*i); + (void) rmdir(p); + } +} + +static void unit_free_requires_mounts_for(Unit *u) { + assert(u); + + for (;;) { + _cleanup_free_ char *path = NULL; + + path = hashmap_steal_first_key(u->requires_mounts_for); + if (!path) + break; + else { + char s[strlen(path) + 1]; + + PATH_FOREACH_PREFIX_MORE(s, path) { + char *y; + Set *x; + + x = hashmap_get2(u->manager->units_requiring_mounts_for, s, (void**) &y); + if (!x) + continue; + + (void) set_remove(x, u); + + if (set_isempty(x)) { + (void) hashmap_remove(u->manager->units_requiring_mounts_for, y); + free(y); + set_free(x); + } + } + } + } + + u->requires_mounts_for = hashmap_free(u->requires_mounts_for); +} + +static void unit_done(Unit *u) { + ExecContext *ec; + CGroupContext *cc; + + assert(u); + + if (u->type < 0) + return; + + if (UNIT_VTABLE(u)->done) + UNIT_VTABLE(u)->done(u); + + ec = unit_get_exec_context(u); + if (ec) + exec_context_done(ec); + + cc = unit_get_cgroup_context(u); + if (cc) + cgroup_context_done(cc); +} + +Unit* unit_free(Unit *u) { + Unit *slice; + char *t; + + if (!u) + return NULL; + + sd_event_source_disable_unref(u->auto_start_stop_event_source); + + u->transient_file = safe_fclose(u->transient_file); + + if (!MANAGER_IS_RELOADING(u->manager)) + unit_remove_transient(u); + + bus_unit_send_removed_signal(u); + + unit_done(u); + + unit_dequeue_rewatch_pids(u); + + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + u->bus_track = sd_bus_track_unref(u->bus_track); + u->deserialized_refs = strv_free(u->deserialized_refs); + u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation); + + unit_free_requires_mounts_for(u); + + SET_FOREACH(t, u->aliases) + hashmap_remove_value(u->manager->units, t, u); + if (u->id) + hashmap_remove_value(u->manager->units, u->id, u); + + if (!sd_id128_is_null(u->invocation_id)) + hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u); + + if (u->job) { + Job *j = u->job; + job_uninstall(j); + job_free(j); + } + + if (u->nop_job) { + Job *j = u->nop_job; + job_uninstall(j); + job_free(j); + } + + /* A unit is being dropped from the tree, make sure our family is realized properly. Do this after we + * detach the unit from slice tree in order to eliminate its effect on controller masks. */ + slice = UNIT_GET_SLICE(u); + unit_clear_dependencies(u); + if (slice) + unit_add_family_to_cgroup_realize_queue(slice); + + if (u->on_console) + manager_unref_console(u->manager); + + fdset_free(u->initial_socket_bind_link_fds); +#if BPF_FRAMEWORK + bpf_link_free(u->ipv4_socket_bind_link); + bpf_link_free(u->ipv6_socket_bind_link); +#endif + + unit_release_cgroup(u); + + if (!MANAGER_IS_RELOADING(u->manager)) + unit_unlink_state_files(u); + + unit_unref_uid_gid(u, false); + + (void) manager_update_failed_units(u->manager, u, false); + set_remove(u->manager->startup_units, u); + + unit_unwatch_all_pids(u); + + while (u->refs_by_target) + unit_ref_unset(u->refs_by_target); + + if (u->type != _UNIT_TYPE_INVALID) + LIST_REMOVE(units_by_type, u->manager->units_by_type[u->type], u); + + if (u->in_load_queue) + LIST_REMOVE(load_queue, u->manager->load_queue, u); + + if (u->in_dbus_queue) + LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u); + + if (u->in_cleanup_queue) + LIST_REMOVE(cleanup_queue, u->manager->cleanup_queue, u); + + if (u->in_gc_queue) + LIST_REMOVE(gc_queue, u->manager->gc_unit_queue, u); + + if (u->in_cgroup_realize_queue) + LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u); + + if (u->in_cgroup_empty_queue) + LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u); + + if (u->in_cgroup_oom_queue) + LIST_REMOVE(cgroup_oom_queue, u->manager->cgroup_oom_queue, u); + + if (u->in_target_deps_queue) + LIST_REMOVE(target_deps_queue, u->manager->target_deps_queue, u); + + if (u->in_stop_when_unneeded_queue) + LIST_REMOVE(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u); + + if (u->in_start_when_upheld_queue) + LIST_REMOVE(start_when_upheld_queue, u->manager->start_when_upheld_queue, u); + + if (u->in_stop_when_bound_queue) + LIST_REMOVE(stop_when_bound_queue, u->manager->stop_when_bound_queue, u); + + if (u->in_release_resources_queue) + LIST_REMOVE(release_resources_queue, u->manager->release_resources_queue, u); + + bpf_firewall_close(u); + + hashmap_free(u->bpf_foreign_by_key); + + bpf_program_free(u->bpf_device_control_installed); + +#if BPF_FRAMEWORK + bpf_link_free(u->restrict_ifaces_ingress_bpf_link); + bpf_link_free(u->restrict_ifaces_egress_bpf_link); +#endif + fdset_free(u->initial_restric_ifaces_link_fds); + + condition_free_list(u->conditions); + condition_free_list(u->asserts); + + free(u->description); + strv_free(u->documentation); + free(u->fragment_path); + free(u->source_path); + strv_free(u->dropin_paths); + free(u->instance); + + free(u->job_timeout_reboot_arg); + free(u->reboot_arg); + + free(u->access_selinux_context); + + set_free_free(u->aliases); + free(u->id); + + activation_details_unref(u->activation_details); + + return mfree(u); +} + +FreezerState unit_freezer_state(Unit *u) { + assert(u); + + return u->freezer_state; +} + +int unit_freezer_state_kernel(Unit *u, FreezerState *ret) { + char *values[1] = {}; + int r; + + assert(u); + + r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", + STRV_MAKE("frozen"), values); + if (r < 0) + return r; + + r = _FREEZER_STATE_INVALID; + + if (values[0]) { + if (streq(values[0], "0")) + r = FREEZER_RUNNING; + else if (streq(values[0], "1")) + r = FREEZER_FROZEN; + } + + free(values[0]); + *ret = r; + + return 0; +} + +UnitActiveState unit_active_state(Unit *u) { + assert(u); + + if (u->load_state == UNIT_MERGED) + return unit_active_state(unit_follow_merge(u)); + + /* After a reload it might happen that a unit is not correctly + * loaded but still has a process around. That's why we won't + * shortcut failed loading to UNIT_INACTIVE_FAILED. */ + + return UNIT_VTABLE(u)->active_state(u); +} + +const char* unit_sub_state_to_string(Unit *u) { + assert(u); + + return UNIT_VTABLE(u)->sub_state_to_string(u); +} + +static int unit_merge_names(Unit *u, Unit *other) { + char *name; + int r; + + assert(u); + assert(other); + + r = unit_add_alias(u, other->id); + if (r < 0) + return r; + + r = set_move(u->aliases, other->aliases); + if (r < 0) { + set_remove(u->aliases, other->id); + return r; + } + + TAKE_PTR(other->id); + other->aliases = set_free_free(other->aliases); + + SET_FOREACH(name, u->aliases) + assert_se(hashmap_replace(u->manager->units, name, u) == 0); + + return 0; +} + +static int unit_reserve_dependencies(Unit *u, Unit *other) { + size_t n_reserve; + Hashmap* deps; + void *d; + int r; + + assert(u); + assert(other); + + /* Let's reserve some space in the dependency hashmaps so that later on merging the units cannot + * fail. + * + * First make some room in the per dependency type hashmaps. Using the summed size of both units' + * hashmaps is an estimate that is likely too high since they probably use some of the same + * types. But it's never too low, and that's all we need. */ + + n_reserve = MIN(hashmap_size(other->dependencies), LESS_BY((size_t) _UNIT_DEPENDENCY_MAX, hashmap_size(u->dependencies))); + if (n_reserve > 0) { + r = hashmap_ensure_allocated(&u->dependencies, NULL); + if (r < 0) + return r; + + r = hashmap_reserve(u->dependencies, n_reserve); + if (r < 0) + return r; + } + + /* Now, enlarge our per dependency type hashmaps by the number of entries in the same hashmap of the + * other unit's dependencies. + * + * NB: If u does not have a dependency set allocated for some dependency type, there is no need to + * reserve anything for. In that case other's set will be transferred as a whole to u by + * complete_move(). */ + + HASHMAP_FOREACH_KEY(deps, d, u->dependencies) { + Hashmap *other_deps; + + other_deps = hashmap_get(other->dependencies, d); + + r = hashmap_reserve(deps, hashmap_size(other_deps)); + if (r < 0) + return r; + } + + return 0; +} + +static bool unit_should_warn_about_dependency(UnitDependency dependency) { + /* Only warn about some unit types */ + return IN_SET(dependency, + UNIT_CONFLICTS, + UNIT_CONFLICTED_BY, + UNIT_BEFORE, + UNIT_AFTER, + UNIT_ON_SUCCESS, + UNIT_ON_FAILURE, + UNIT_TRIGGERS, + UNIT_TRIGGERED_BY); +} + +static int unit_per_dependency_type_hashmap_update( + Hashmap *per_type, + Unit *other, + UnitDependencyMask origin_mask, + UnitDependencyMask destination_mask) { + + UnitDependencyInfo info; + int r; + + assert(other); + assert_cc(sizeof(void*) == sizeof(info)); + + /* Acquire the UnitDependencyInfo entry for the Unit* we are interested in, and update it if it + * exists, or insert it anew if not. */ + + info.data = hashmap_get(per_type, other); + if (info.data) { + /* Entry already exists. Add in our mask. */ + + if (FLAGS_SET(origin_mask, info.origin_mask) && + FLAGS_SET(destination_mask, info.destination_mask)) + return 0; /* NOP */ + + info.origin_mask |= origin_mask; + info.destination_mask |= destination_mask; + + r = hashmap_update(per_type, other, info.data); + } else { + info = (UnitDependencyInfo) { + .origin_mask = origin_mask, + .destination_mask = destination_mask, + }; + + r = hashmap_put(per_type, other, info.data); + } + if (r < 0) + return r; + + return 1; +} + +static void unit_merge_dependencies(Unit *u, Unit *other) { + Hashmap *deps; + void *dt; /* Actually of type UnitDependency, except that we don't bother casting it here, + * since the hashmaps all want it as void pointer. */ + + assert(u); + assert(other); + + if (u == other) + return; + + /* First, remove dependency to other. */ + HASHMAP_FOREACH_KEY(deps, dt, u->dependencies) { + if (hashmap_remove(deps, other) && unit_should_warn_about_dependency(UNIT_DEPENDENCY_FROM_PTR(dt))) + log_unit_warning(u, "Dependency %s=%s is dropped, as %s is merged into %s.", + unit_dependency_to_string(UNIT_DEPENDENCY_FROM_PTR(dt)), + other->id, other->id, u->id); + + if (hashmap_isempty(deps)) + hashmap_free(hashmap_remove(u->dependencies, dt)); + } + + for (;;) { + _cleanup_hashmap_free_ Hashmap *other_deps = NULL; + UnitDependencyInfo di_back; + Unit *back; + + /* Let's focus on one dependency type at a time, that 'other' has defined. */ + other_deps = hashmap_steal_first_key_and_value(other->dependencies, &dt); + if (!other_deps) + break; /* done! */ + + deps = hashmap_get(u->dependencies, dt); + + /* Now iterate through all dependencies of this dependency type, of 'other'. We refer to the + * referenced units as 'back'. */ + HASHMAP_FOREACH_KEY(di_back.data, back, other_deps) { + Hashmap *back_deps; + void *back_dt; + + if (back == u) { + /* This is a dependency pointing back to the unit we want to merge with? + * Suppress it (but warn) */ + if (unit_should_warn_about_dependency(UNIT_DEPENDENCY_FROM_PTR(dt))) + log_unit_warning(u, "Dependency %s=%s in %s is dropped, as %s is merged into %s.", + unit_dependency_to_string(UNIT_DEPENDENCY_FROM_PTR(dt)), + u->id, other->id, other->id, u->id); + + hashmap_remove(other_deps, back); + continue; + } + + /* Now iterate through all deps of 'back', and fix the ones pointing to 'other' to + * point to 'u' instead. */ + HASHMAP_FOREACH_KEY(back_deps, back_dt, back->dependencies) { + UnitDependencyInfo di_move; + + di_move.data = hashmap_remove(back_deps, other); + if (!di_move.data) + continue; + + assert_se(unit_per_dependency_type_hashmap_update( + back_deps, + u, + di_move.origin_mask, + di_move.destination_mask) >= 0); + } + + /* The target unit already has dependencies of this type, let's then merge this individually. */ + if (deps) + assert_se(unit_per_dependency_type_hashmap_update( + deps, + back, + di_back.origin_mask, + di_back.destination_mask) >= 0); + } + + /* Now all references towards 'other' of the current type 'dt' are corrected to point to 'u'. + * Lets's now move the deps of type 'dt' from 'other' to 'u'. If the unit does not have + * dependencies of this type, let's move them per type wholesale. */ + if (!deps) + assert_se(hashmap_put(u->dependencies, dt, TAKE_PTR(other_deps)) >= 0); + } + + other->dependencies = hashmap_free(other->dependencies); +} + +int unit_merge(Unit *u, Unit *other) { + int r; + + assert(u); + assert(other); + assert(u->manager == other->manager); + assert(u->type != _UNIT_TYPE_INVALID); + + other = unit_follow_merge(other); + + if (other == u) + return 0; + + if (u->type != other->type) + return -EINVAL; + + if (!unit_type_may_alias(u->type)) /* Merging only applies to unit names that support aliases */ + return -EEXIST; + + if (!IN_SET(other->load_state, UNIT_STUB, UNIT_NOT_FOUND)) + return -EEXIST; + + if (!streq_ptr(u->instance, other->instance)) + return -EINVAL; + + if (other->job) + return -EEXIST; + + if (other->nop_job) + return -EEXIST; + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) + return -EEXIST; + + /* Make reservations to ensure merge_dependencies() won't fail. We don't rollback reservations if we + * fail. We don't have a way to undo reservations. A reservation is not a leak. */ + r = unit_reserve_dependencies(u, other); + if (r < 0) + return r; + + /* Redirect all references */ + while (other->refs_by_target) + unit_ref_set(other->refs_by_target, other->refs_by_target->source, u); + + /* Merge dependencies */ + unit_merge_dependencies(u, other); + + /* Merge names. It is better to do that after merging deps, otherwise the log message contains n/a. */ + r = unit_merge_names(u, other); + if (r < 0) + return r; + + other->load_state = UNIT_MERGED; + other->merged_into = u; + + if (!u->activation_details) + u->activation_details = activation_details_ref(other->activation_details); + + /* If there is still some data attached to the other node, we + * don't need it anymore, and can free it. */ + if (other->load_state != UNIT_STUB) + if (UNIT_VTABLE(other)->done) + UNIT_VTABLE(other)->done(other); + + unit_add_to_dbus_queue(u); + unit_add_to_cleanup_queue(other); + + return 0; +} + +int unit_merge_by_name(Unit *u, const char *name) { + _cleanup_free_ char *s = NULL; + Unit *other; + int r; + + /* Either add name to u, or if a unit with name already exists, merge it with u. + * If name is a template, do the same for name@instance, where instance is u's instance. */ + + assert(u); + assert(name); + + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + if (!u->instance) + return -EINVAL; + + r = unit_name_replace_instance(name, u->instance, &s); + if (r < 0) + return r; + + name = s; + } + + other = manager_get_unit(u->manager, name); + if (other) + return unit_merge(u, other); + + return unit_add_name(u, name); +} + +Unit* unit_follow_merge(Unit *u) { + assert(u); + + while (u->load_state == UNIT_MERGED) + assert_se(u = u->merged_into); + + return u; +} + +int unit_add_exec_dependencies(Unit *u, ExecContext *c) { + int r; + + assert(u); + assert(c); + + /* Unlike unit_add_dependency() or friends, this always returns 0 on success. */ + + if (c->working_directory && !c->working_directory_missing_ok) { + r = unit_require_mounts_for(u, c->working_directory, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->root_directory) { + r = unit_require_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->root_image) { + r = unit_require_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + if (!u->manager->prefix[dt]) + continue; + + for (size_t i = 0; i < c->directories[dt].n_items; i++) { + _cleanup_free_ char *p = NULL; + + p = path_join(u->manager->prefix[dt], c->directories[dt].items[i].path); + if (!p) + return -ENOMEM; + + r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + } + + if (!MANAGER_IS_SYSTEM(u->manager)) + return 0; + + /* For the following three directory types we need write access, and /var/ is possibly on the root + * fs. Hence order after systemd-remount-fs.service, to ensure things are writable. */ + if (c->directories[EXEC_DIRECTORY_STATE].n_items > 0 || + c->directories[EXEC_DIRECTORY_CACHE].n_items > 0 || + c->directories[EXEC_DIRECTORY_LOGS].n_items > 0) { + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->private_tmp) { + + /* FIXME: for now we make a special case for /tmp and add a weak dependency on + * tmp.mount so /tmp being masked is supported. However there's no reason to treat + * /tmp specifically and masking other mount units should be handled more + * gracefully too, see PR#16894. */ + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "tmp.mount", true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + r = unit_require_mounts_for(u, "/var/tmp", UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_TMPFILES_SETUP_SERVICE, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->root_image) { + /* We need to wait for /dev/loopX to appear when doing RootImage=, hence let's add an + * implicit dependency on udev */ + + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_UDEVD_SERVICE, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (!IN_SET(c->std_output, + EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE) && + !IN_SET(c->std_error, + EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE) && + !c->log_namespace) + return 0; + + /* If syslog or kernel logging is requested (or log namespacing is), make sure our own logging daemon + * is run first. */ + + if (c->log_namespace) { + _cleanup_free_ char *socket_unit = NULL, *varlink_socket_unit = NULL; + + r = unit_name_build_from_type("systemd-journald", c->log_namespace, UNIT_SOCKET, &socket_unit); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, socket_unit, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + r = unit_name_build_from_type("systemd-journald-varlink", c->log_namespace, UNIT_SOCKET, &varlink_socket_unit); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, varlink_socket_unit, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } else { + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_JOURNALD_SOCKET, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + r = unit_add_default_credential_dependencies(u, c); + if (r < 0) + return r; + + return 0; +} + +const char* unit_description(Unit *u) { + assert(u); + + if (u->description) + return u->description; + + return strna(u->id); +} + +const char* unit_status_string(Unit *u, char **ret_combined_buffer) { + assert(u); + assert(u->id); + + /* Return u->id, u->description, or "{u->id} - {u->description}". + * Versions with u->description are only used if it is set. + * The last option is used if configured and the caller provided the 'ret_combined_buffer' + * pointer. + * + * Note that *ret_combined_buffer may be set to NULL. */ + + if (!u->description || + u->manager->status_unit_format == STATUS_UNIT_FORMAT_NAME || + (u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED && !ret_combined_buffer) || + streq(u->description, u->id)) { + + if (ret_combined_buffer) + *ret_combined_buffer = NULL; + return u->id; + } + + if (ret_combined_buffer) { + if (u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED) { + *ret_combined_buffer = strjoin(u->id, " - ", u->description); + if (*ret_combined_buffer) + return *ret_combined_buffer; + log_oom(); /* Fall back to ->description */ + } else + *ret_combined_buffer = NULL; + } + + return u->description; +} + +/* Common implementation for multiple backends */ +int unit_load_fragment_and_dropin(Unit *u, bool fragment_required) { + int r; + + assert(u); + + /* Load a .{service,socket,...} file */ + r = unit_load_fragment(u); + if (r < 0) + return r; + + if (u->load_state == UNIT_STUB) { + if (fragment_required) + return -ENOENT; + + u->load_state = UNIT_LOADED; + } + + /* Load drop-in directory data. If u is an alias, we might be reloading the + * target unit needlessly. But we cannot be sure which drops-ins have already + * been loaded and which not, at least without doing complicated book-keeping, + * so let's always reread all drop-ins. */ + r = unit_load_dropin(unit_follow_merge(u)); + if (r < 0) + return r; + + if (u->source_path) { + struct stat st; + + if (stat(u->source_path, &st) >= 0) + u->source_mtime = timespec_load(&st.st_mtim); + else + u->source_mtime = 0; + } + + return 0; +} + +void unit_add_to_target_deps_queue(Unit *u) { + Manager *m = ASSERT_PTR(ASSERT_PTR(u)->manager); + + if (u->in_target_deps_queue) + return; + + LIST_PREPEND(target_deps_queue, m->target_deps_queue, u); + u->in_target_deps_queue = true; +} + +int unit_add_default_target_dependency(Unit *u, Unit *target) { + assert(u); + assert(target); + + if (target->type != UNIT_TARGET) + return 0; + + /* Only add the dependency if both units are loaded, so that + * that loop check below is reliable */ + if (u->load_state != UNIT_LOADED || + target->load_state != UNIT_LOADED) + return 0; + + /* If either side wants no automatic dependencies, then let's + * skip this */ + if (!u->default_dependencies || + !target->default_dependencies) + return 0; + + /* Don't create loops */ + if (unit_has_dependency(target, UNIT_ATOM_BEFORE, u)) + return 0; + + return unit_add_dependency(target, UNIT_AFTER, u, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int unit_add_slice_dependencies(Unit *u) { + Unit *slice; + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + /* Slice units are implicitly ordered against their parent slices (as this relationship is encoded in the + name), while all other units are ordered based on configuration (as in their case Slice= configures the + relationship). */ + UnitDependencyMask mask = u->type == UNIT_SLICE ? UNIT_DEPENDENCY_IMPLICIT : UNIT_DEPENDENCY_FILE; + + slice = UNIT_GET_SLICE(u); + if (slice) + return unit_add_two_dependencies(u, UNIT_AFTER, UNIT_REQUIRES, slice, true, mask); + + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, SPECIAL_ROOT_SLICE, true, mask); +} + +static int unit_add_mount_dependencies(Unit *u) { + UnitDependencyInfo di; + const char *path; + bool changed = false; + int r; + + assert(u); + + HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) { + char prefix[strlen(path) + 1]; + + PATH_FOREACH_PREFIX_MORE(prefix, path) { + _cleanup_free_ char *p = NULL; + Unit *m; + + r = unit_name_from_path(prefix, ".mount", &p); + if (r == -EINVAL) + continue; /* If the path cannot be converted to a mount unit name, then it's + * not manageable as a unit by systemd, and hence we don't need a + * dependency on it. Let's thus silently ignore the issue. */ + if (r < 0) + return r; + + m = manager_get_unit(u->manager, p); + if (!m) { + /* Make sure to load the mount unit if it exists. If so the dependencies on + * this unit will be added later during the loading of the mount unit. */ + (void) manager_load_unit_prepare(u->manager, p, NULL, NULL, &m); + continue; + } + if (m == u) + continue; + + if (m->load_state != UNIT_LOADED) + continue; + + r = unit_add_dependency(u, UNIT_AFTER, m, true, di.origin_mask); + if (r < 0) + return r; + changed = changed || r > 0; + + if (m->fragment_path) { + r = unit_add_dependency(u, UNIT_REQUIRES, m, true, di.origin_mask); + if (r < 0) + return r; + changed = changed || r > 0; + } + } + } + + return changed; +} + +static int unit_add_oomd_dependencies(Unit *u) { + CGroupContext *c; + CGroupMask mask; + int r; + + assert(u); + + if (!u->default_dependencies) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + bool wants_oomd = c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL; + if (!wants_oomd) + return 0; + + if (!cg_all_unified()) + return 0; + + r = cg_mask_supported(&mask); + if (r < 0) + return log_debug_errno(r, "Failed to determine supported controllers: %m"); + + if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY)) + return 0; + + return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "systemd-oomd.service", true, UNIT_DEPENDENCY_FILE); +} + +static int unit_add_startup_units(Unit *u) { + if (!unit_has_startup_cgroup_constraints(u)) + return 0; + + return set_ensure_put(&u->manager->startup_units, NULL, u); +} + +static int unit_validate_on_failure_job_mode( + Unit *u, + const char *job_mode_setting, + JobMode job_mode, + const char *dependency_name, + UnitDependencyAtom atom) { + + Unit *other, *found = NULL; + + if (job_mode != JOB_ISOLATE) + return 0; + + UNIT_FOREACH_DEPENDENCY(other, u, atom) { + if (!found) + found = other; + else if (found != other) + return log_unit_error_errno( + u, SYNTHETIC_ERRNO(ENOEXEC), + "More than one %s dependencies specified but %sisolate set. Refusing.", + dependency_name, job_mode_setting); + } + + return 0; +} + +int unit_load(Unit *u) { + int r; + + assert(u); + + if (u->in_load_queue) { + LIST_REMOVE(load_queue, u->manager->load_queue, u); + u->in_load_queue = false; + } + + if (u->type == _UNIT_TYPE_INVALID) + return -EINVAL; + + if (u->load_state != UNIT_STUB) + return 0; + + if (u->transient_file) { + /* Finalize transient file: if this is a transient unit file, as soon as we reach unit_load() the setup + * is complete, hence let's synchronize the unit file we just wrote to disk. */ + + r = fflush_and_check(u->transient_file); + if (r < 0) + goto fail; + + u->transient_file = safe_fclose(u->transient_file); + u->fragment_mtime = now(CLOCK_REALTIME); + } + + r = UNIT_VTABLE(u)->load(u); + if (r < 0) + goto fail; + + assert(u->load_state != UNIT_STUB); + + if (u->load_state == UNIT_LOADED) { + unit_add_to_target_deps_queue(u); + + r = unit_add_slice_dependencies(u); + if (r < 0) + goto fail; + + r = unit_add_mount_dependencies(u); + if (r < 0) + goto fail; + + r = unit_add_oomd_dependencies(u); + if (r < 0) + goto fail; + + r = unit_add_startup_units(u); + if (r < 0) + goto fail; + + r = unit_validate_on_failure_job_mode(u, "OnSuccessJobMode=", u->on_success_job_mode, "OnSuccess=", UNIT_ATOM_ON_SUCCESS); + if (r < 0) + goto fail; + + r = unit_validate_on_failure_job_mode(u, "OnFailureJobMode=", u->on_failure_job_mode, "OnFailure=", UNIT_ATOM_ON_FAILURE); + if (r < 0) + goto fail; + + if (u->job_running_timeout != USEC_INFINITY && u->job_running_timeout > u->job_timeout) + log_unit_warning(u, "JobRunningTimeoutSec= is greater than JobTimeoutSec=, it has no effect."); + + /* We finished loading, let's ensure our parents recalculate the members mask */ + unit_invalidate_cgroup_members_masks(u); + } + + assert((u->load_state != UNIT_MERGED) == !u->merged_into); + + unit_add_to_dbus_queue(unit_follow_merge(u)); + unit_add_to_gc_queue(u); + (void) manager_varlink_send_managed_oom_update(u); + + return 0; + +fail: + /* We convert ENOEXEC errors to the UNIT_BAD_SETTING load state here. Configuration parsing code + * should hence return ENOEXEC to ensure units are placed in this state after loading. */ + + u->load_state = u->load_state == UNIT_STUB ? UNIT_NOT_FOUND : + r == -ENOEXEC ? UNIT_BAD_SETTING : + UNIT_ERROR; + u->load_error = r; + + /* Record the timestamp on the cache, so that if the cache gets updated between now and the next time + * an attempt is made to load this unit, we know we need to check again. */ + if (u->load_state == UNIT_NOT_FOUND) + u->fragment_not_found_timestamp_hash = u->manager->unit_cache_timestamp_hash; + + unit_add_to_dbus_queue(u); + unit_add_to_gc_queue(u); + + return log_unit_debug_errno(u, r, "Failed to load configuration: %m"); +} + +_printf_(7, 8) +static int log_unit_internal(void *userdata, int level, int error, const char *file, int line, const char *func, const char *format, ...) { + Unit *u = userdata; + va_list ap; + int r; + + if (u && !unit_log_level_test(u, level)) + return -ERRNO_VALUE(error); + + va_start(ap, format); + if (u) + r = log_object_internalv(level, error, file, line, func, + u->manager->unit_log_field, + u->id, + u->manager->invocation_log_field, + u->invocation_id_string, + format, ap); + else + r = log_internalv(level, error, file, line, func, format, ap); + va_end(ap); + + return r; +} + +static bool unit_test_condition(Unit *u) { + _cleanup_strv_free_ char **env = NULL; + int r; + + assert(u); + + dual_timestamp_now(&u->condition_timestamp); + + r = manager_get_effective_environment(u->manager, &env); + if (r < 0) { + log_unit_error_errno(u, r, "Failed to determine effective environment: %m"); + u->condition_result = true; + } else + u->condition_result = condition_test_list( + u->conditions, + env, + condition_type_to_string, + log_unit_internal, + u); + + unit_add_to_dbus_queue(u); + return u->condition_result; +} + +static bool unit_test_assert(Unit *u) { + _cleanup_strv_free_ char **env = NULL; + int r; + + assert(u); + + dual_timestamp_now(&u->assert_timestamp); + + r = manager_get_effective_environment(u->manager, &env); + if (r < 0) { + log_unit_error_errno(u, r, "Failed to determine effective environment: %m"); + u->assert_result = CONDITION_ERROR; + } else + u->assert_result = condition_test_list( + u->asserts, + env, + assert_type_to_string, + log_unit_internal, + u); + + unit_add_to_dbus_queue(u); + return u->assert_result; +} + +void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *format, const char *ident) { + if (log_get_show_color()) { + if (u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED && strchr(ident, ' ')) + ident = strjoina(ANSI_HIGHLIGHT, u->id, ANSI_NORMAL, " - ", u->description); + else + ident = strjoina(ANSI_HIGHLIGHT, ident, ANSI_NORMAL); + } + + DISABLE_WARNING_FORMAT_NONLITERAL; + manager_status_printf(u->manager, status_type, status, format, ident); + REENABLE_WARNING; +} + +int unit_test_start_limit(Unit *u) { + const char *reason; + + assert(u); + + if (ratelimit_below(&u->start_ratelimit)) { + u->start_limit_hit = false; + return 0; + } + + log_unit_warning(u, "Start request repeated too quickly."); + u->start_limit_hit = true; + + reason = strjoina("unit ", u->id, " failed"); + + emergency_action(u->manager, u->start_limit_action, + EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN, + u->reboot_arg, -1, reason); + + return -ECANCELED; +} + +static bool unit_verify_deps(Unit *u) { + Unit *other; + + assert(u); + + /* Checks whether all BindsTo= dependencies of this unit are fulfilled — if they are also combined + * with After=. We do not check Requires= or Requisite= here as they only should have an effect on + * the job processing, but do not have any effect afterwards. We don't check BindsTo= dependencies + * that are not used in conjunction with After= as for them any such check would make things entirely + * racy. */ + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT) { + + if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other)) + continue; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(other))) { + log_unit_notice(u, "Bound to unit %s, but unit isn't active.", other->id); + return false; + } + } + + return true; +} + +/* Errors that aren't really errors: + * -EALREADY: Unit is already started. + * -ECOMM: Condition failed + * -EAGAIN: An operation is already in progress. Retry later. + * + * Errors that are real errors: + * -EBADR: This unit type does not support starting. + * -ECANCELED: Start limit hit, too many requests for now + * -EPROTO: Assert failed + * -EINVAL: Unit not loaded + * -EOPNOTSUPP: Unit type not supported + * -ENOLINK: The necessary dependencies are not fulfilled. + * -ESTALE: This unit has been started before and can't be started a second time + * -ENOENT: This is a triggering unit and unit to trigger is not loaded + */ +int unit_start(Unit *u, ActivationDetails *details) { + UnitActiveState state; + Unit *following; + int r; + + assert(u); + + /* Let's hold off running start jobs for mount units when /proc/self/mountinfo monitor is ratelimited. */ + if (UNIT_VTABLE(u)->subsystem_ratelimited) { + r = UNIT_VTABLE(u)->subsystem_ratelimited(u->manager); + if (r < 0) + return r; + if (r > 0) + return -EAGAIN; + } + + /* If this is already started, then this will succeed. Note that this will even succeed if this unit + * is not startable by the user. This is relied on to detect when we need to wait for units and when + * waiting is finished. */ + state = unit_active_state(u); + if (UNIT_IS_ACTIVE_OR_RELOADING(state)) + return -EALREADY; + if (state == UNIT_MAINTENANCE) + return -EAGAIN; + + /* Units that aren't loaded cannot be started */ + if (u->load_state != UNIT_LOADED) + return -EINVAL; + + /* Refuse starting scope units more than once */ + if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_enter_timestamp)) + return -ESTALE; + + /* If the conditions were unmet, don't do anything at all. If we already are activating this call might + * still be useful to speed up activation in case there is some hold-off time, but we don't want to + * recheck the condition in that case. */ + if (state != UNIT_ACTIVATING && + !unit_test_condition(u)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(ECOMM), "Starting requested but condition not met. Not starting unit."); + + /* If the asserts failed, fail the entire job */ + if (state != UNIT_ACTIVATING && + !unit_test_assert(u)) + return log_unit_notice_errno(u, SYNTHETIC_ERRNO(EPROTO), "Starting requested but asserts failed."); + + /* Units of types that aren't supported cannot be started. Note that we do this test only after the + * condition checks, so that we rather return condition check errors (which are usually not + * considered a true failure) than "not supported" errors (which are considered a failure). + */ + if (!unit_type_supported(u->type)) + return -EOPNOTSUPP; + + /* Let's make sure that the deps really are in order before we start this. Normally the job engine + * should have taken care of this already, but let's check this here again. After all, our + * dependencies might not be in effect anymore, due to a reload or due to an unmet condition. */ + if (!unit_verify_deps(u)) + return -ENOLINK; + + /* Forward to the main object, if we aren't it. */ + following = unit_following(u); + if (following) { + log_unit_debug(u, "Redirecting start request from %s to %s.", u->id, following->id); + return unit_start(following, details); + } + + /* Check our ability to start early so that failure conditions don't cause us to enter a busy loop. */ + if (UNIT_VTABLE(u)->can_start) { + r = UNIT_VTABLE(u)->can_start(u); + if (r < 0) + return r; + } + + /* If it is stopped, but we cannot start it, then fail */ + if (!UNIT_VTABLE(u)->start) + return -EBADR; + + /* We don't suppress calls to ->start() here when we are already starting, to allow this request to + * be used as a "hurry up" call, for example when the unit is in some "auto restart" state where it + * waits for a holdoff timer to elapse before it will start again. */ + + unit_add_to_dbus_queue(u); + unit_cgroup_freezer_action(u, FREEZER_THAW); + + if (!u->activation_details) /* Older details object wins */ + u->activation_details = activation_details_ref(details); + + return UNIT_VTABLE(u)->start(u); +} + +bool unit_can_start(Unit *u) { + assert(u); + + if (u->load_state != UNIT_LOADED) + return false; + + if (!unit_type_supported(u->type)) + return false; + + /* Scope units may be started only once */ + if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_exit_timestamp)) + return false; + + return !!UNIT_VTABLE(u)->start; +} + +bool unit_can_isolate(Unit *u) { + assert(u); + + return unit_can_start(u) && + u->allow_isolate; +} + +/* Errors: + * -EBADR: This unit type does not support stopping. + * -EALREADY: Unit is already stopped. + * -EAGAIN: An operation is already in progress. Retry later. + */ +int unit_stop(Unit *u) { + UnitActiveState state; + Unit *following; + + assert(u); + + state = unit_active_state(u); + if (UNIT_IS_INACTIVE_OR_FAILED(state)) + return -EALREADY; + + following = unit_following(u); + if (following) { + log_unit_debug(u, "Redirecting stop request from %s to %s.", u->id, following->id); + return unit_stop(following); + } + + if (!UNIT_VTABLE(u)->stop) + return -EBADR; + + unit_add_to_dbus_queue(u); + unit_cgroup_freezer_action(u, FREEZER_THAW); + + return UNIT_VTABLE(u)->stop(u); +} + +bool unit_can_stop(Unit *u) { + assert(u); + + /* Note: if we return true here, it does not mean that the unit may be successfully stopped. + * Extrinsic units follow external state and they may stop following external state changes + * (hence we return true here), but an attempt to do this through the manager will fail. */ + + if (!unit_type_supported(u->type)) + return false; + + if (u->perpetual) + return false; + + return !!UNIT_VTABLE(u)->stop; +} + +/* Errors: + * -EBADR: This unit type does not support reloading. + * -ENOEXEC: Unit is not started. + * -EAGAIN: An operation is already in progress. Retry later. + */ +int unit_reload(Unit *u) { + UnitActiveState state; + Unit *following; + + assert(u); + + if (u->load_state != UNIT_LOADED) + return -EINVAL; + + if (!unit_can_reload(u)) + return -EBADR; + + state = unit_active_state(u); + if (state == UNIT_RELOADING) + return -EAGAIN; + + if (state != UNIT_ACTIVE) + return log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "Unit cannot be reloaded because it is inactive."); + + following = unit_following(u); + if (following) { + log_unit_debug(u, "Redirecting reload request from %s to %s.", u->id, following->id); + return unit_reload(following); + } + + unit_add_to_dbus_queue(u); + + if (!UNIT_VTABLE(u)->reload) { + /* Unit doesn't have a reload function, but we need to propagate the reload anyway */ + unit_notify(u, unit_active_state(u), unit_active_state(u), /* reload_success = */ true); + return 0; + } + + unit_cgroup_freezer_action(u, FREEZER_THAW); + + return UNIT_VTABLE(u)->reload(u); +} + +bool unit_can_reload(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->can_reload) + return UNIT_VTABLE(u)->can_reload(u); + + if (unit_has_dependency(u, UNIT_ATOM_PROPAGATES_RELOAD_TO, NULL)) + return true; + + return UNIT_VTABLE(u)->reload; +} + +bool unit_is_unneeded(Unit *u) { + Unit *other; + assert(u); + + if (!u->stop_when_unneeded) + return false; + + /* Don't clean up while the unit is transitioning or is even inactive. */ + if (unit_active_state(u) != UNIT_ACTIVE) + return false; + if (u->job) + return false; + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED) { + /* If a dependent unit has a job queued, is active or transitioning, or is marked for + * restart, then don't clean this one up. */ + + if (other->job) + return false; + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) + return false; + + if (unit_will_restart(other)) + return false; + } + + return true; +} + +bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit) { + Unit *other; + + assert(u); + + /* Checks if the unit needs to be started because it currently is not running, but some other unit + * that is active declared an Uphold= dependencies on it */ + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) || u->job) { + if (ret_culprit) + *ret_culprit = NULL; + return false; + } + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_START_STEADILY) { + if (other->job) + continue; + + if (UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(other))) { + if (ret_culprit) + *ret_culprit = other; + return true; + } + } + + if (ret_culprit) + *ret_culprit = NULL; + return false; +} + +bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit) { + Unit *other; + + assert(u); + + /* Checks whether this unit is bound to another unit that is inactive, i.e. whether we should stop + * because the other unit is down. */ + + if (unit_active_state(u) != UNIT_ACTIVE || u->job) { + /* Don't clean up while the unit is transitioning or is even inactive. */ + if (ret_culprit) + *ret_culprit = NULL; + return false; + } + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT) { + if (other->job) + continue; + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) { + if (ret_culprit) + *ret_culprit = other; + + return true; + } + } + + if (ret_culprit) + *ret_culprit = NULL; + return false; +} + +static void check_unneeded_dependencies(Unit *u) { + Unit *other; + assert(u); + + /* Add all units this unit depends on to the queue that processes StopWhenUnneeded= behaviour. */ + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE) + unit_submit_to_stop_when_unneeded_queue(other); +} + +static void check_uphold_dependencies(Unit *u) { + Unit *other; + assert(u); + + /* Add all units this unit depends on to the queue that processes Uphold= behaviour. */ + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE) + unit_submit_to_start_when_upheld_queue(other); +} + +static void check_bound_by_dependencies(Unit *u) { + Unit *other; + assert(u); + + /* Add all units this unit depends on to the queue that processes BindsTo= stop behaviour. */ + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE) + unit_submit_to_stop_when_bound_queue(other); +} + +static void retroactively_start_dependencies(Unit *u) { + Unit *other; + + assert(u); + assert(UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u))); + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_REPLACE) /* Requires= + BindsTo= */ + if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) && + !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL, NULL); + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_FAIL) /* Wants= */ + if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) && + !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_START, other, JOB_FAIL, NULL, NULL, NULL); + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_START) /* Conflicts= (and inverse) */ + if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL); +} + +static void retroactively_stop_dependencies(Unit *u) { + Unit *other; + + assert(u); + assert(UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u))); + + /* Pull down units which are bound to us recursively if enabled */ + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_STOP) /* BoundBy= */ + if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL); +} + +void unit_start_on_failure( + Unit *u, + const char *dependency_name, + UnitDependencyAtom atom, + JobMode job_mode) { + + int n_jobs = -1; + Unit *other; + int r; + + assert(u); + assert(dependency_name); + assert(IN_SET(atom, UNIT_ATOM_ON_SUCCESS, UNIT_ATOM_ON_FAILURE)); + + /* Act on OnFailure= and OnSuccess= dependencies */ + + UNIT_FOREACH_DEPENDENCY(other, u, atom) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + if (n_jobs < 0) { + log_unit_info(u, "Triggering %s dependencies.", dependency_name); + n_jobs = 0; + } + + r = manager_add_job(u->manager, JOB_START, other, job_mode, NULL, &error, NULL); + if (r < 0) + log_unit_warning_errno( + u, r, "Failed to enqueue %s job, ignoring: %s", + dependency_name, bus_error_message(&error, r)); + n_jobs ++; + } + + if (n_jobs >= 0) + log_unit_debug(u, "Triggering %s dependencies done (%i %s).", + dependency_name, n_jobs, n_jobs == 1 ? "job" : "jobs"); +} + +void unit_trigger_notify(Unit *u) { + Unit *other; + + assert(u); + + UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_TRIGGERED_BY) + if (UNIT_VTABLE(other)->trigger_notify) + UNIT_VTABLE(other)->trigger_notify(other, u); +} + +static int raise_level(int log_level, bool condition_info, bool condition_notice) { + if (condition_notice && log_level > LOG_NOTICE) + return LOG_NOTICE; + if (condition_info && log_level > LOG_INFO) + return LOG_INFO; + return log_level; +} + +static int unit_log_resources(Unit *u) { + struct iovec iovec[1 + 2 + _CGROUP_IP_ACCOUNTING_METRIC_MAX + _CGROUP_IO_ACCOUNTING_METRIC_MAX + 4]; + bool any_traffic = false, have_ip_accounting = false, any_io = false, have_io_accounting = false; + _cleanup_free_ char *igress = NULL, *egress = NULL, *rr = NULL, *wr = NULL; + int log_level = LOG_DEBUG; /* May be raised if resources consumed over a threshold */ + size_t n_message_parts = 0, n_iovec = 0; + char* message_parts[1 + 2 + 2 + 2 + 1], *t; + nsec_t nsec = NSEC_INFINITY; + uint64_t memory_peak = UINT64_MAX, memory_swap_peak = UINT64_MAX; + int r; + const char* const ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "IP_METRIC_INGRESS_BYTES", + [CGROUP_IP_INGRESS_PACKETS] = "IP_METRIC_INGRESS_PACKETS", + [CGROUP_IP_EGRESS_BYTES] = "IP_METRIC_EGRESS_BYTES", + [CGROUP_IP_EGRESS_PACKETS] = "IP_METRIC_EGRESS_PACKETS", + }; + const char* const io_fields[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "IO_METRIC_READ_BYTES", + [CGROUP_IO_WRITE_BYTES] = "IO_METRIC_WRITE_BYTES", + [CGROUP_IO_READ_OPERATIONS] = "IO_METRIC_READ_OPERATIONS", + [CGROUP_IO_WRITE_OPERATIONS] = "IO_METRIC_WRITE_OPERATIONS", + }; + + assert(u); + + /* Invoked whenever a unit enters failed or dead state. Logs information about consumed resources if resource + * accounting was enabled for a unit. It does this in two ways: a friendly human readable string with reduced + * information and the complete data in structured fields. */ + + (void) unit_get_cpu_usage(u, &nsec); + if (nsec != NSEC_INFINITY) { + /* Format the CPU time for inclusion in the structured log message */ + if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, nsec) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the CPU time for inclusion in the human language message string */ + t = strjoin("consumed ", FORMAT_TIMESPAN(nsec / NSEC_PER_USEC, USEC_PER_MSEC), " CPU time"); + if (!t) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = t; + + log_level = raise_level(log_level, + nsec > MENTIONWORTHY_CPU_NSEC, + nsec > NOTICEWORTHY_CPU_NSEC); + } + + (void) unit_get_memory_accounting(u, CGROUP_MEMORY_PEAK, &memory_peak); + if (memory_peak != UINT64_MAX) { + /* Format peak memory for inclusion in the structured log message */ + if (asprintf(&t, "MEMORY_PEAK=%" PRIu64, memory_peak) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format peak memory for inclusion in the human language message string */ + t = strjoin(FORMAT_BYTES(memory_peak), " memory peak"); + if (!t) { + r = log_oom(); + goto finish; + } + message_parts[n_message_parts++] = t; + } + + (void) unit_get_memory_accounting(u, CGROUP_MEMORY_SWAP_PEAK, &memory_swap_peak); + if (memory_swap_peak != UINT64_MAX) { + /* Format peak swap memory for inclusion in the structured log message */ + if (asprintf(&t, "MEMORY_SWAP_PEAK=%" PRIu64, memory_swap_peak) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format peak swap memory for inclusion in the human language message string */ + t = strjoin(FORMAT_BYTES(memory_swap_peak), " memory swap peak"); + if (!t) { + r = log_oom(); + goto finish; + } + message_parts[n_message_parts++] = t; + } + + for (CGroupIOAccountingMetric k = 0; k < _CGROUP_IO_ACCOUNTING_METRIC_MAX; k++) { + uint64_t value = UINT64_MAX; + + assert(io_fields[k]); + + (void) unit_get_io_accounting(u, k, k > 0, &value); + if (value == UINT64_MAX) + continue; + + have_io_accounting = true; + if (value > 0) + any_io = true; + + /* Format IO accounting data for inclusion in the structured log message */ + if (asprintf(&t, "%s=%" PRIu64, io_fields[k], value) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the IO accounting data for inclusion in the human language message string, but only + * for the bytes counters (and not for the operations counters) */ + if (k == CGROUP_IO_READ_BYTES) { + assert(!rr); + rr = strjoin("read ", strna(FORMAT_BYTES(value)), " from disk"); + if (!rr) { + r = log_oom(); + goto finish; + } + } else if (k == CGROUP_IO_WRITE_BYTES) { + assert(!wr); + wr = strjoin("written ", strna(FORMAT_BYTES(value)), " to disk"); + if (!wr) { + r = log_oom(); + goto finish; + } + } + + if (IN_SET(k, CGROUP_IO_READ_BYTES, CGROUP_IO_WRITE_BYTES)) + log_level = raise_level(log_level, + value > MENTIONWORTHY_IO_BYTES, + value > NOTICEWORTHY_IO_BYTES); + } + + if (have_io_accounting) { + if (any_io) { + if (rr) + message_parts[n_message_parts++] = TAKE_PTR(rr); + if (wr) + message_parts[n_message_parts++] = TAKE_PTR(wr); + + } else { + char *k; + + k = strdup("no IO"); + if (!k) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = k; + } + } + + for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { + uint64_t value = UINT64_MAX; + + assert(ip_fields[m]); + + (void) unit_get_ip_accounting(u, m, &value); + if (value == UINT64_MAX) + continue; + + have_ip_accounting = true; + if (value > 0) + any_traffic = true; + + /* Format IP accounting data for inclusion in the structured log message */ + if (asprintf(&t, "%s=%" PRIu64, ip_fields[m], value) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the IP accounting data for inclusion in the human language message string, but only for the + * bytes counters (and not for the packets counters) */ + if (m == CGROUP_IP_INGRESS_BYTES) { + assert(!igress); + igress = strjoin("received ", strna(FORMAT_BYTES(value)), " IP traffic"); + if (!igress) { + r = log_oom(); + goto finish; + } + } else if (m == CGROUP_IP_EGRESS_BYTES) { + assert(!egress); + egress = strjoin("sent ", strna(FORMAT_BYTES(value)), " IP traffic"); + if (!egress) { + r = log_oom(); + goto finish; + } + } + + if (IN_SET(m, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES)) + log_level = raise_level(log_level, + value > MENTIONWORTHY_IP_BYTES, + value > NOTICEWORTHY_IP_BYTES); + } + + /* This check is here because it is the earliest point following all possible log_level assignments. If + * log_level is assigned anywhere after this point, move this check. */ + if (!unit_log_level_test(u, log_level)) { + r = 0; + goto finish; + } + + if (have_ip_accounting) { + if (any_traffic) { + if (igress) + message_parts[n_message_parts++] = TAKE_PTR(igress); + if (egress) + message_parts[n_message_parts++] = TAKE_PTR(egress); + + } else { + char *k; + + k = strdup("no IP traffic"); + if (!k) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = k; + } + } + + /* Is there any accounting data available at all? */ + if (n_iovec == 0) { + r = 0; + goto finish; + } + + if (n_message_parts == 0) + t = strjoina("MESSAGE=", u->id, ": Completed."); + else { + _cleanup_free_ char *joined = NULL; + + message_parts[n_message_parts] = NULL; + + joined = strv_join(message_parts, ", "); + if (!joined) { + r = log_oom(); + goto finish; + } + + joined[0] = ascii_toupper(joined[0]); + t = strjoina("MESSAGE=", u->id, ": ", joined, "."); + } + + /* The following four fields we allocate on the stack or are static strings, we hence don't want to free them, + * and hence don't increase n_iovec for them */ + iovec[n_iovec] = IOVEC_MAKE_STRING(t); + iovec[n_iovec + 1] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_UNIT_RESOURCES_STR); + + t = strjoina(u->manager->unit_log_field, u->id); + iovec[n_iovec + 2] = IOVEC_MAKE_STRING(t); + + t = strjoina(u->manager->invocation_log_field, u->invocation_id_string); + iovec[n_iovec + 3] = IOVEC_MAKE_STRING(t); + + log_unit_struct_iovec(u, log_level, iovec, n_iovec + 4); + r = 0; + +finish: + free_many_charp(message_parts, n_message_parts); + + for (size_t i = 0; i < n_iovec; i++) + free(iovec[i].iov_base); + + return r; + +} + +static void unit_update_on_console(Unit *u) { + bool b; + + assert(u); + + b = unit_needs_console(u); + if (u->on_console == b) + return; + + u->on_console = b; + if (b) + manager_ref_console(u->manager); + else + manager_unref_console(u->manager); +} + +static void unit_emit_audit_start(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->audit_start_message_type <= 0) + return; + + /* Write audit record if we have just finished starting up */ + manager_send_unit_audit(u->manager, u, UNIT_VTABLE(u)->audit_start_message_type, /* success= */ true); + u->in_audit = true; +} + +static void unit_emit_audit_stop(Unit *u, UnitActiveState state) { + assert(u); + + if (UNIT_VTABLE(u)->audit_start_message_type <= 0) + return; + + if (u->in_audit) { + /* Write audit record if we have just finished shutting down */ + manager_send_unit_audit(u->manager, u, UNIT_VTABLE(u)->audit_stop_message_type, /* success= */ state == UNIT_INACTIVE); + u->in_audit = false; + } else { + /* Hmm, if there was no start record written write it now, so that we always have a nice pair */ + manager_send_unit_audit(u->manager, u, UNIT_VTABLE(u)->audit_start_message_type, /* success= */ state == UNIT_INACTIVE); + + if (state == UNIT_INACTIVE) + manager_send_unit_audit(u->manager, u, UNIT_VTABLE(u)->audit_stop_message_type, /* success= */ true); + } +} + +static bool unit_process_job(Job *j, UnitActiveState ns, bool reload_success) { + bool unexpected = false; + JobResult result; + + assert(j); + + if (j->state == JOB_WAITING) + /* So we reached a different state for this job. Let's see if we can run it now if it failed previously + * due to EAGAIN. */ + job_add_to_run_queue(j); + + /* Let's check whether the unit's new state constitutes a finished job, or maybe contradicts a running job and + * hence needs to invalidate jobs. */ + + switch (j->type) { + + case JOB_START: + case JOB_VERIFY_ACTIVE: + + if (UNIT_IS_ACTIVE_OR_RELOADING(ns)) + job_finish_and_invalidate(j, JOB_DONE, true, false); + else if (j->state == JOB_RUNNING && ns != UNIT_ACTIVATING) { + unexpected = true; + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) { + if (ns == UNIT_FAILED) + result = JOB_FAILED; + else + result = JOB_DONE; + + job_finish_and_invalidate(j, result, true, false); + } + } + + break; + + case JOB_RELOAD: + case JOB_RELOAD_OR_START: + case JOB_TRY_RELOAD: + + if (j->state == JOB_RUNNING) { + if (ns == UNIT_ACTIVE) + job_finish_and_invalidate(j, reload_success ? JOB_DONE : JOB_FAILED, true, false); + else if (!IN_SET(ns, UNIT_ACTIVATING, UNIT_RELOADING)) { + unexpected = true; + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) + job_finish_and_invalidate(j, ns == UNIT_FAILED ? JOB_FAILED : JOB_DONE, true, false); + } + } + + break; + + case JOB_STOP: + case JOB_RESTART: + case JOB_TRY_RESTART: + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) + job_finish_and_invalidate(j, JOB_DONE, true, false); + else if (j->state == JOB_RUNNING && ns != UNIT_DEACTIVATING) { + unexpected = true; + job_finish_and_invalidate(j, JOB_FAILED, true, false); + } + + break; + + default: + assert_not_reached(); + } + + return unexpected; +} + +void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_success) { + const char *reason; + Manager *m; + + assert(u); + assert(os < _UNIT_ACTIVE_STATE_MAX); + assert(ns < _UNIT_ACTIVE_STATE_MAX); + + /* Note that this is called for all low-level state changes, even if they might map to the same high-level + * UnitActiveState! That means that ns == os is an expected behavior here. For example: if a mount point is + * remounted this function will be called too! */ + + m = u->manager; + + /* Let's enqueue the change signal early. In case this unit has a job associated we want that this unit is in + * the bus queue, so that any job change signal queued will force out the unit change signal first. */ + unit_add_to_dbus_queue(u); + + /* Update systemd-oomd on the property/state change */ + if (os != ns) { + /* Always send an update if the unit is going into an inactive state so systemd-oomd knows to stop + * monitoring. + * Also send an update whenever the unit goes active; this is to handle a case where an override file + * sets one of the ManagedOOM*= properties to "kill", then later removes it. systemd-oomd needs to + * know to stop monitoring when the unit changes from "kill" -> "auto" on daemon-reload, but we don't + * have the information on the property. Thus, indiscriminately send an update. */ + if (UNIT_IS_INACTIVE_OR_FAILED(ns) || UNIT_IS_ACTIVE_OR_RELOADING(ns)) + (void) manager_varlink_send_managed_oom_update(u); + } + + /* Update timestamps for state changes */ + if (!MANAGER_IS_RELOADING(m)) { + dual_timestamp_now(&u->state_change_timestamp); + + if (UNIT_IS_INACTIVE_OR_FAILED(os) && !UNIT_IS_INACTIVE_OR_FAILED(ns)) + u->inactive_exit_timestamp = u->state_change_timestamp; + else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_INACTIVE_OR_FAILED(ns)) + u->inactive_enter_timestamp = u->state_change_timestamp; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(os) && UNIT_IS_ACTIVE_OR_RELOADING(ns)) + u->active_enter_timestamp = u->state_change_timestamp; + else if (UNIT_IS_ACTIVE_OR_RELOADING(os) && !UNIT_IS_ACTIVE_OR_RELOADING(ns)) + u->active_exit_timestamp = u->state_change_timestamp; + } + + /* Keep track of failed units */ + (void) manager_update_failed_units(m, u, ns == UNIT_FAILED); + + /* Make sure the cgroup and state files are always removed when we become inactive */ + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) { + SET_FLAG(u->markers, + (1u << UNIT_MARKER_NEEDS_RELOAD)|(1u << UNIT_MARKER_NEEDS_RESTART), + false); + unit_prune_cgroup(u); + unit_unlink_state_files(u); + } else if (ns != os && ns == UNIT_RELOADING) + SET_FLAG(u->markers, 1u << UNIT_MARKER_NEEDS_RELOAD, false); + + unit_update_on_console(u); + + if (!MANAGER_IS_RELOADING(m)) { + bool unexpected; + + /* Let's propagate state changes to the job */ + if (u->job) + unexpected = unit_process_job(u->job, ns, reload_success); + else + unexpected = true; + + /* If this state change happened without being requested by a job, then let's retroactively start or + * stop dependencies. We skip that step when deserializing, since we don't want to create any + * additional jobs just because something is already activated. */ + + if (unexpected) { + if (UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_ACTIVE_OR_ACTIVATING(ns)) + retroactively_start_dependencies(u); + else if (UNIT_IS_ACTIVE_OR_ACTIVATING(os) && UNIT_IS_INACTIVE_OR_DEACTIVATING(ns)) + retroactively_stop_dependencies(u); + } + + if (ns != os && ns == UNIT_FAILED) { + log_unit_debug(u, "Unit entered failed state."); + unit_start_on_failure(u, "OnFailure=", UNIT_ATOM_ON_FAILURE, u->on_failure_job_mode); + } + + if (UNIT_IS_ACTIVE_OR_RELOADING(ns) && !UNIT_IS_ACTIVE_OR_RELOADING(os)) { + /* This unit just finished starting up */ + + unit_emit_audit_start(u); + manager_send_unit_plymouth(m, u); + } + + if (UNIT_IS_INACTIVE_OR_FAILED(ns) && !UNIT_IS_INACTIVE_OR_FAILED(os)) { + /* This unit just stopped/failed. */ + + unit_emit_audit_stop(u, ns); + unit_log_resources(u); + } + + if (ns == UNIT_INACTIVE && !IN_SET(os, UNIT_FAILED, UNIT_INACTIVE, UNIT_MAINTENANCE)) + unit_start_on_failure(u, "OnSuccess=", UNIT_ATOM_ON_SUCCESS, u->on_success_job_mode); + } + + manager_recheck_journal(m); + manager_recheck_dbus(m); + + unit_trigger_notify(u); + + if (!MANAGER_IS_RELOADING(m)) { + if (os != UNIT_FAILED && ns == UNIT_FAILED) { + reason = strjoina("unit ", u->id, " failed"); + emergency_action(m, u->failure_action, 0, u->reboot_arg, unit_failure_action_exit_status(u), reason); + } else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && ns == UNIT_INACTIVE) { + reason = strjoina("unit ", u->id, " succeeded"); + emergency_action(m, u->success_action, 0, u->reboot_arg, unit_success_action_exit_status(u), reason); + } + } + + /* And now, add the unit or depending units to various queues that will act on the new situation if + * needed. These queues generally check for continuous state changes rather than events (like most of + * the state propagation above), and do work deferred instead of instantly, since they typically + * don't want to run during reloading, and usually involve checking combined state of multiple units + * at once. */ + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) { + /* Stop unneeded units and bound-by units regardless if going down was expected or not */ + check_unneeded_dependencies(u); + check_bound_by_dependencies(u); + + /* Maybe someone wants us to remain up? */ + unit_submit_to_start_when_upheld_queue(u); + + /* Maybe the unit should be GC'ed now? */ + unit_add_to_gc_queue(u); + + /* Maybe we can release some resources now? */ + unit_submit_to_release_resources_queue(u); + } + + if (UNIT_IS_ACTIVE_OR_RELOADING(ns)) { + /* Start uphold units regardless if going up was expected or not */ + check_uphold_dependencies(u); + + /* Maybe we finished startup and are now ready for being stopped because unneeded? */ + unit_submit_to_stop_when_unneeded_queue(u); + + /* Maybe we finished startup, but something we needed has vanished? Let's die then. (This happens + * when something BindsTo= to a Type=oneshot unit, as these units go directly from starting to + * inactive, without ever entering started.) */ + unit_submit_to_stop_when_bound_queue(u); + } +} + +int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive) { + _cleanup_(pidref_freep) PidRef *pid_dup = NULL; + int r; + + /* Adds a specific PID to the set of PIDs this unit watches. */ + + assert(u); + assert(pidref_is_set(pid)); + + /* Caller might be sure that this PID belongs to this unit only. Let's take this + * opportunity to remove any stalled references to this PID as they can be created + * easily (when watching a process which is not our direct child). */ + if (exclusive) + manager_unwatch_pidref(u->manager, pid); + + if (set_contains(u->pids, pid)) /* early exit if already being watched */ + return 0; + + r = pidref_dup(pid, &pid_dup); + if (r < 0) + return r; + + /* First, insert into the set of PIDs maintained by the unit */ + r = set_ensure_put(&u->pids, &pidref_hash_ops_free, pid_dup); + if (r < 0) + return r; + + pid = TAKE_PTR(pid_dup); /* continue with our copy now that we have installed it properly in our set */ + + /* Second, insert it into the simple global table, see if that works */ + r = hashmap_ensure_put(&u->manager->watch_pids, &pidref_hash_ops_free, pid, u); + if (r != -EEXIST) + return r; + + /* OK, the key is already assigned to a different unit. That's fine, then add us via the second + * hashmap that points to an array. */ + + PidRef *old_pid = NULL; + Unit **array = hashmap_get2(u->manager->watch_pids_more, pid, (void**) &old_pid); + + /* Count entries in array */ + size_t n = 0; + for (; array && array[n]; n++) + ; + + /* Allocate a new array */ + _cleanup_free_ Unit **new_array = new(Unit*, n + 2); + if (!new_array) + return -ENOMEM; + + /* Append us to the end */ + memcpy_safe(new_array, array, sizeof(Unit*) * n); + new_array[n] = u; + new_array[n+1] = NULL; + + /* Make sure the hashmap is allocated */ + r = hashmap_ensure_allocated(&u->manager->watch_pids_more, &pidref_hash_ops_free); + if (r < 0) + return r; + + /* Add or replace the old array */ + r = hashmap_replace(u->manager->watch_pids_more, old_pid ?: pid, new_array); + if (r < 0) + return r; + + TAKE_PTR(new_array); /* Now part of the hash table */ + free(array); /* Which means we can now delete the old version */ + return 0; +} + +int unit_watch_pid(Unit *u, pid_t pid, bool exclusive) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + int r; + + assert(u); + assert(pid_is_valid(pid)); + + r = pidref_set_pid(&pidref, pid); + if (r < 0) + return r; + + return unit_watch_pidref(u, &pidref, exclusive); +} + +void unit_unwatch_pidref(Unit *u, PidRef *pid) { + assert(u); + assert(pidref_is_set(pid)); + + /* Remove from the set we maintain for this unit. (And destroy the returned pid eventually) */ + _cleanup_(pidref_freep) PidRef *pid1 = set_remove(u->pids, pid); + if (!pid1) + return; /* Early exit if this PID was never watched by us */ + + /* First let's drop the unit from the simple hash table, if it is included there */ + PidRef *pid2 = NULL; + Unit *uu = hashmap_get2(u->manager->watch_pids, pid, (void**) &pid2); + + /* Quick validation: iff we are in the watch_pids table then the PidRef object must be the same as in our local pids set */ + assert((uu == u) == (pid1 == pid2)); + + if (uu == u) + /* OK, we are in the first table. Let's remove it there then, and we are done already. */ + assert_se(hashmap_remove_value(u->manager->watch_pids, pid2, uu)); + else { + /* We weren't in the first table, then let's consult the 2nd table that points to an array */ + PidRef *pid3 = NULL; + Unit **array = hashmap_get2(u->manager->watch_pids_more, pid, (void**) &pid3); + + /* Let's iterate through the array, dropping our own entry */ + size_t m = 0, n = 0; + for (; array && array[n]; n++) + if (array[n] != u) + array[m++] = array[n]; + if (n == m) + return; /* Not there */ + + array[m] = NULL; /* set trailing NULL marker on the new end */ + + if (m == 0) { + /* The array is now empty, remove the entire entry */ + assert_se(hashmap_remove_value(u->manager->watch_pids_more, pid3, array)); + free(array); + } else { + /* The array is not empty, but let's make sure the entry is not keyed by the PidRef + * we will delete, but by the PidRef object of the Unit that is now first in the + * array. */ + + PidRef *new_pid3 = ASSERT_PTR(set_get(array[0]->pids, pid)); + assert_se(hashmap_replace(u->manager->watch_pids_more, new_pid3, array) >= 0); + } + } +} + +void unit_unwatch_pid(Unit *u, pid_t pid) { + return unit_unwatch_pidref(u, &PIDREF_MAKE_FROM_PID(pid)); +} + +void unit_unwatch_all_pids(Unit *u) { + assert(u); + + while (!set_isempty(u->pids)) + unit_unwatch_pidref(u, set_first(u->pids)); + + u->pids = set_free(u->pids); +} + +static void unit_tidy_watch_pids(Unit *u) { + PidRef *except1, *except2, *e; + + assert(u); + + /* Cleans dead PIDs from our list */ + + except1 = unit_main_pid(u); + except2 = unit_control_pid(u); + + SET_FOREACH(e, u->pids) { + if (pidref_equal(except1, e) || pidref_equal(except2, e)) + continue; + + if (pidref_is_unwaited(e) <= 0) + unit_unwatch_pidref(u, e); + } +} + +static int on_rewatch_pids_event(sd_event_source *s, void *userdata) { + Unit *u = ASSERT_PTR(userdata); + + assert(s); + + unit_tidy_watch_pids(u); + unit_watch_all_pids(u); + + /* If the PID set is empty now, then let's finish this off. */ + unit_synthesize_cgroup_empty_event(u); + + return 0; +} + +int unit_enqueue_rewatch_pids(Unit *u) { + int r; + + assert(u); + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we can use proper notifications */ + return 0; + + /* Enqueues a low-priority job that will clean up dead PIDs from our list of PIDs to watch and subscribe to new + * PIDs that might have appeared. We do this in a delayed job because the work might be quite slow, as it + * involves issuing kill(pid, 0) on all processes we watch. */ + + if (!u->rewatch_pids_event_source) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + + r = sd_event_add_defer(u->manager->event, &s, on_rewatch_pids_event, u); + if (r < 0) + return log_error_errno(r, "Failed to allocate event source for tidying watched PIDs: %m"); + + r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return log_error_errno(r, "Failed to adjust priority of event source for tidying watched PIDs: %m"); + + (void) sd_event_source_set_description(s, "tidy-watch-pids"); + + u->rewatch_pids_event_source = TAKE_PTR(s); + } + + r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Failed to enable event source for tidying watched PIDs: %m"); + + return 0; +} + +void unit_dequeue_rewatch_pids(Unit *u) { + int r; + assert(u); + + if (!u->rewatch_pids_event_source) + return; + + r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_OFF); + if (r < 0) + log_warning_errno(r, "Failed to disable event source for tidying watched PIDs, ignoring: %m"); + + u->rewatch_pids_event_source = sd_event_source_disable_unref(u->rewatch_pids_event_source); +} + +bool unit_job_is_applicable(Unit *u, JobType j) { + assert(u); + assert(j >= 0 && j < _JOB_TYPE_MAX); + + switch (j) { + + case JOB_VERIFY_ACTIVE: + case JOB_START: + case JOB_NOP: + /* Note that we don't check unit_can_start() here. That's because .device units and suchlike are not + * startable by us but may appear due to external events, and it thus makes sense to permit enqueuing + * jobs for it. */ + return true; + + case JOB_STOP: + /* Similar as above. However, perpetual units can never be stopped (neither explicitly nor due to + * external events), hence it makes no sense to permit enqueuing such a request either. */ + return !u->perpetual; + + case JOB_RESTART: + case JOB_TRY_RESTART: + return unit_can_stop(u) && unit_can_start(u); + + case JOB_RELOAD: + case JOB_TRY_RELOAD: + return unit_can_reload(u); + + case JOB_RELOAD_OR_START: + return unit_can_reload(u) && unit_can_start(u); + + default: + assert_not_reached(); + } +} + +static Hashmap *unit_get_dependency_hashmap_per_type(Unit *u, UnitDependency d) { + Hashmap *deps; + + assert(u); + assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX); + + deps = hashmap_get(u->dependencies, UNIT_DEPENDENCY_TO_PTR(d)); + if (!deps) { + _cleanup_hashmap_free_ Hashmap *h = NULL; + + h = hashmap_new(NULL); + if (!h) + return NULL; + + if (hashmap_ensure_put(&u->dependencies, NULL, UNIT_DEPENDENCY_TO_PTR(d), h) < 0) + return NULL; + + deps = TAKE_PTR(h); + } + + return deps; +} + +typedef enum NotifyDependencyFlags { + NOTIFY_DEPENDENCY_UPDATE_FROM = 1 << 0, + NOTIFY_DEPENDENCY_UPDATE_TO = 1 << 1, +} NotifyDependencyFlags; + +static int unit_add_dependency_impl( + Unit *u, + UnitDependency d, + Unit *other, + UnitDependencyMask mask) { + + static const UnitDependency inverse_table[_UNIT_DEPENDENCY_MAX] = { + [UNIT_REQUIRES] = UNIT_REQUIRED_BY, + [UNIT_REQUISITE] = UNIT_REQUISITE_OF, + [UNIT_WANTS] = UNIT_WANTED_BY, + [UNIT_BINDS_TO] = UNIT_BOUND_BY, + [UNIT_PART_OF] = UNIT_CONSISTS_OF, + [UNIT_UPHOLDS] = UNIT_UPHELD_BY, + [UNIT_REQUIRED_BY] = UNIT_REQUIRES, + [UNIT_REQUISITE_OF] = UNIT_REQUISITE, + [UNIT_WANTED_BY] = UNIT_WANTS, + [UNIT_BOUND_BY] = UNIT_BINDS_TO, + [UNIT_CONSISTS_OF] = UNIT_PART_OF, + [UNIT_UPHELD_BY] = UNIT_UPHOLDS, + [UNIT_CONFLICTS] = UNIT_CONFLICTED_BY, + [UNIT_CONFLICTED_BY] = UNIT_CONFLICTS, + [UNIT_BEFORE] = UNIT_AFTER, + [UNIT_AFTER] = UNIT_BEFORE, + [UNIT_ON_SUCCESS] = UNIT_ON_SUCCESS_OF, + [UNIT_ON_SUCCESS_OF] = UNIT_ON_SUCCESS, + [UNIT_ON_FAILURE] = UNIT_ON_FAILURE_OF, + [UNIT_ON_FAILURE_OF] = UNIT_ON_FAILURE, + [UNIT_TRIGGERS] = UNIT_TRIGGERED_BY, + [UNIT_TRIGGERED_BY] = UNIT_TRIGGERS, + [UNIT_PROPAGATES_RELOAD_TO] = UNIT_RELOAD_PROPAGATED_FROM, + [UNIT_RELOAD_PROPAGATED_FROM] = UNIT_PROPAGATES_RELOAD_TO, + [UNIT_PROPAGATES_STOP_TO] = UNIT_STOP_PROPAGATED_FROM, + [UNIT_STOP_PROPAGATED_FROM] = UNIT_PROPAGATES_STOP_TO, + [UNIT_JOINS_NAMESPACE_OF] = UNIT_JOINS_NAMESPACE_OF, /* symmetric! 👓 */ + [UNIT_REFERENCES] = UNIT_REFERENCED_BY, + [UNIT_REFERENCED_BY] = UNIT_REFERENCES, + [UNIT_IN_SLICE] = UNIT_SLICE_OF, + [UNIT_SLICE_OF] = UNIT_IN_SLICE, + }; + + Hashmap *u_deps, *other_deps; + UnitDependencyInfo u_info, u_info_old, other_info, other_info_old; + NotifyDependencyFlags flags = 0; + int r; + + assert(u); + assert(other); + assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX); + assert(inverse_table[d] >= 0 && inverse_table[d] < _UNIT_DEPENDENCY_MAX); + assert(mask > 0 && mask < _UNIT_DEPENDENCY_MASK_FULL); + + /* Ensure the following two hashmaps for each unit exist: + * - the top-level dependency hashmap that maps UnitDependency → Hashmap(Unit* → UnitDependencyInfo), + * - the inner hashmap, that maps Unit* → UnitDependencyInfo, for the specified dependency type. */ + u_deps = unit_get_dependency_hashmap_per_type(u, d); + if (!u_deps) + return -ENOMEM; + + other_deps = unit_get_dependency_hashmap_per_type(other, inverse_table[d]); + if (!other_deps) + return -ENOMEM; + + /* Save the original dependency info. */ + u_info.data = u_info_old.data = hashmap_get(u_deps, other); + other_info.data = other_info_old.data = hashmap_get(other_deps, u); + + /* Update dependency info. */ + u_info.origin_mask |= mask; + other_info.destination_mask |= mask; + + /* Save updated dependency info. */ + if (u_info.data != u_info_old.data) { + r = hashmap_replace(u_deps, other, u_info.data); + if (r < 0) + return r; + + flags = NOTIFY_DEPENDENCY_UPDATE_FROM; + } + + if (other_info.data != other_info_old.data) { + r = hashmap_replace(other_deps, u, other_info.data); + if (r < 0) { + if (u_info.data != u_info_old.data) { + /* Restore the old dependency. */ + if (u_info_old.data) + (void) hashmap_update(u_deps, other, u_info_old.data); + else + hashmap_remove(u_deps, other); + } + return r; + } + + flags |= NOTIFY_DEPENDENCY_UPDATE_TO; + } + + return flags; +} + +int unit_add_dependency( + Unit *u, + UnitDependency d, + Unit *other, + bool add_reference, + UnitDependencyMask mask) { + + UnitDependencyAtom a; + int r; + + /* Helper to know whether sending a notification is necessary or not: if the dependency is already + * there, no need to notify! */ + NotifyDependencyFlags notify_flags; + + assert(u); + assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX); + assert(other); + + u = unit_follow_merge(u); + other = unit_follow_merge(other); + a = unit_dependency_to_atom(d); + assert(a >= 0); + + /* We won't allow dependencies on ourselves. We will not consider them an error however. */ + if (u == other) { + if (unit_should_warn_about_dependency(d)) + log_unit_warning(u, "Dependency %s=%s is dropped.", + unit_dependency_to_string(d), u->id); + return 0; + } + + if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES)) + return 0; + + /* Note that ordering a device unit after a unit is permitted since it allows to start its job + * running timeout at a specific time. */ + if (FLAGS_SET(a, UNIT_ATOM_BEFORE) && other->type == UNIT_DEVICE) { + log_unit_warning(u, "Dependency Before=%s ignored (.device units cannot be delayed)", other->id); + return 0; + } + + if (FLAGS_SET(a, UNIT_ATOM_ON_FAILURE) && !UNIT_VTABLE(u)->can_fail) { + log_unit_warning(u, "Requested dependency OnFailure=%s ignored (%s units cannot fail).", other->id, unit_type_to_string(u->type)); + return 0; + } + + if (FLAGS_SET(a, UNIT_ATOM_TRIGGERS) && !UNIT_VTABLE(u)->can_trigger) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Requested dependency Triggers=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(u->type)); + if (FLAGS_SET(a, UNIT_ATOM_TRIGGERED_BY) && !UNIT_VTABLE(other)->can_trigger) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Requested dependency TriggeredBy=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(other->type)); + + if (FLAGS_SET(a, UNIT_ATOM_IN_SLICE) && other->type != UNIT_SLICE) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Requested dependency Slice=%s refused (%s is not a slice unit).", other->id, other->id); + if (FLAGS_SET(a, UNIT_ATOM_SLICE_OF) && u->type != UNIT_SLICE) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Requested dependency SliceOf=%s refused (%s is not a slice unit).", other->id, u->id); + + if (FLAGS_SET(a, UNIT_ATOM_IN_SLICE) && !UNIT_HAS_CGROUP_CONTEXT(u)) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Requested dependency Slice=%s refused (%s is not a cgroup unit).", other->id, u->id); + + if (FLAGS_SET(a, UNIT_ATOM_SLICE_OF) && !UNIT_HAS_CGROUP_CONTEXT(other)) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Requested dependency SliceOf=%s refused (%s is not a cgroup unit).", other->id, other->id); + + r = unit_add_dependency_impl(u, d, other, mask); + if (r < 0) + return r; + notify_flags = r; + + if (add_reference) { + r = unit_add_dependency_impl(u, UNIT_REFERENCES, other, mask); + if (r < 0) + return r; + notify_flags |= r; + } + + if (FLAGS_SET(notify_flags, NOTIFY_DEPENDENCY_UPDATE_FROM)) + unit_add_to_dbus_queue(u); + if (FLAGS_SET(notify_flags, NOTIFY_DEPENDENCY_UPDATE_TO)) + unit_add_to_dbus_queue(other); + + return notify_flags != 0; +} + +int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask) { + int r = 0, s = 0; + + assert(u); + assert(d >= 0 || e >= 0); + + if (d >= 0) { + r = unit_add_dependency(u, d, other, add_reference, mask); + if (r < 0) + return r; + } + + if (e >= 0) { + s = unit_add_dependency(u, e, other, add_reference, mask); + if (s < 0) + return s; + } + + return r > 0 || s > 0; +} + +static int resolve_template(Unit *u, const char *name, char **buf, const char **ret) { + int r; + + assert(u); + assert(name); + assert(buf); + assert(ret); + + if (!unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + *buf = NULL; + *ret = name; + return 0; + } + + if (u->instance) + r = unit_name_replace_instance(name, u->instance, buf); + else { + _cleanup_free_ char *i = NULL; + + r = unit_name_to_prefix(u->id, &i); + if (r < 0) + return r; + + r = unit_name_replace_instance(name, i, buf); + } + if (r < 0) + return r; + + *ret = *buf; + return 0; +} + +int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask) { + _cleanup_free_ char *buf = NULL; + Unit *other; + int r; + + assert(u); + assert(name); + + r = resolve_template(u, name, &buf, &name); + if (r < 0) + return r; + + if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES)) + return 0; + + r = manager_load_unit(u->manager, name, NULL, NULL, &other); + if (r < 0) + return r; + + return unit_add_dependency(u, d, other, add_reference, mask); +} + +int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask) { + _cleanup_free_ char *buf = NULL; + Unit *other; + int r; + + assert(u); + assert(name); + + r = resolve_template(u, name, &buf, &name); + if (r < 0) + return r; + + if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES)) + return 0; + + r = manager_load_unit(u->manager, name, NULL, NULL, &other); + if (r < 0) + return r; + + return unit_add_two_dependencies(u, d, e, other, add_reference, mask); +} + +int set_unit_path(const char *p) { + /* This is mostly for debug purposes */ + return RET_NERRNO(setenv("SYSTEMD_UNIT_PATH", p, 1)); +} + +char *unit_dbus_path(Unit *u) { + assert(u); + + if (!u->id) + return NULL; + + return unit_dbus_path_from_name(u->id); +} + +char *unit_dbus_path_invocation_id(Unit *u) { + assert(u); + + if (sd_id128_is_null(u->invocation_id)) + return NULL; + + return unit_dbus_path_from_name(u->invocation_id_string); +} + +int unit_set_invocation_id(Unit *u, sd_id128_t id) { + int r; + + assert(u); + + /* Set the invocation ID for this unit. If we cannot, this will not roll back, but reset the whole thing. */ + + if (sd_id128_equal(u->invocation_id, id)) + return 0; + + if (!sd_id128_is_null(u->invocation_id)) + (void) hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u); + + if (sd_id128_is_null(id)) { + r = 0; + goto reset; + } + + r = hashmap_ensure_allocated(&u->manager->units_by_invocation_id, &id128_hash_ops); + if (r < 0) + goto reset; + + u->invocation_id = id; + sd_id128_to_string(id, u->invocation_id_string); + + r = hashmap_put(u->manager->units_by_invocation_id, &u->invocation_id, u); + if (r < 0) + goto reset; + + return 0; + +reset: + u->invocation_id = SD_ID128_NULL; + u->invocation_id_string[0] = 0; + return r; +} + +int unit_set_slice(Unit *u, Unit *slice) { + int r; + + assert(u); + assert(slice); + + /* Sets the unit slice if it has not been set before. Is extra careful, to only allow this for units + * that actually have a cgroup context. Also, we don't allow to set this for slices (since the parent + * slice is derived from the name). Make sure the unit we set is actually a slice. */ + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EOPNOTSUPP; + + if (u->type == UNIT_SLICE) + return -EINVAL; + + if (unit_active_state(u) != UNIT_INACTIVE) + return -EBUSY; + + if (slice->type != UNIT_SLICE) + return -EINVAL; + + if (unit_has_name(u, SPECIAL_INIT_SCOPE) && + !unit_has_name(slice, SPECIAL_ROOT_SLICE)) + return -EPERM; + + if (UNIT_GET_SLICE(u) == slice) + return 0; + + /* Disallow slice changes if @u is already bound to cgroups */ + if (UNIT_GET_SLICE(u) && u->cgroup_realized) + return -EBUSY; + + /* Remove any slices assigned prior; we should only have one UNIT_IN_SLICE dependency */ + if (UNIT_GET_SLICE(u)) + unit_remove_dependencies(u, UNIT_DEPENDENCY_SLICE_PROPERTY); + + r = unit_add_dependency(u, UNIT_IN_SLICE, slice, true, UNIT_DEPENDENCY_SLICE_PROPERTY); + if (r < 0) + return r; + + return 1; +} + +int unit_set_default_slice(Unit *u) { + const char *slice_name; + Unit *slice; + int r; + + assert(u); + + if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES)) + return 0; + + if (UNIT_GET_SLICE(u)) + return 0; + + if (u->instance) { + _cleanup_free_ char *prefix = NULL, *escaped = NULL; + + /* Implicitly place all instantiated units in their + * own per-template slice */ + + r = unit_name_to_prefix(u->id, &prefix); + if (r < 0) + return r; + + /* The prefix is already escaped, but it might include + * "-" which has a special meaning for slice units, + * hence escape it here extra. */ + escaped = unit_name_escape(prefix); + if (!escaped) + return -ENOMEM; + + if (MANAGER_IS_SYSTEM(u->manager)) + slice_name = strjoina("system-", escaped, ".slice"); + else + slice_name = strjoina("app-", escaped, ".slice"); + + } else if (unit_is_extrinsic(u)) + /* Keep all extrinsic units (e.g. perpetual units and swap and mount units in user mode) in + * the root slice. They don't really belong in one of the subslices. */ + slice_name = SPECIAL_ROOT_SLICE; + + else if (MANAGER_IS_SYSTEM(u->manager)) + slice_name = SPECIAL_SYSTEM_SLICE; + else + slice_name = SPECIAL_APP_SLICE; + + r = manager_load_unit(u->manager, slice_name, NULL, NULL, &slice); + if (r < 0) + return r; + + return unit_set_slice(u, slice); +} + +const char *unit_slice_name(Unit *u) { + Unit *slice; + assert(u); + + slice = UNIT_GET_SLICE(u); + if (!slice) + return NULL; + + return slice->id; +} + +int unit_load_related_unit(Unit *u, const char *type, Unit **_found) { + _cleanup_free_ char *t = NULL; + int r; + + assert(u); + assert(type); + assert(_found); + + r = unit_name_change_suffix(u->id, type, &t); + if (r < 0) + return r; + if (unit_has_name(u, t)) + return -EINVAL; + + r = manager_load_unit(u->manager, t, NULL, NULL, _found); + assert(r < 0 || *_found != u); + return r; +} + +static int signal_name_owner_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *new_owner; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = sd_bus_message_read(message, "sss", NULL, NULL, &new_owner); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (UNIT_VTABLE(u)->bus_name_owner_change) + UNIT_VTABLE(u)->bus_name_owner_change(u, empty_to_null(new_owner)); + + return 0; +} + +static int get_name_owner_handler(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const sd_bus_error *e; + const char *new_owner; + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(message); + + u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot); + + e = sd_bus_message_get_error(message); + if (e) { + if (!sd_bus_error_has_name(e, SD_BUS_ERROR_NAME_HAS_NO_OWNER)) { + r = sd_bus_error_get_errno(e); + log_unit_error_errno(u, r, + "Unexpected error response from GetNameOwner(): %s", + bus_error_message(e, r)); + } + + new_owner = NULL; + } else { + r = sd_bus_message_read(message, "s", &new_owner); + if (r < 0) + return bus_log_parse_error(r); + + assert(!isempty(new_owner)); + } + + if (UNIT_VTABLE(u)->bus_name_owner_change) + UNIT_VTABLE(u)->bus_name_owner_change(u, new_owner); + + return 0; +} + +int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + const char *match; + usec_t timeout_usec = 0; + int r; + + assert(u); + assert(bus); + assert(name); + + if (u->match_bus_slot || u->get_name_owner_slot) + return -EBUSY; + + /* NameOwnerChanged and GetNameOwner is used to detect when a service finished starting up. The dbus + * call timeout shouldn't be earlier than that. If we couldn't get the start timeout, use the default + * value defined above. */ + if (UNIT_VTABLE(u)->get_timeout_start_usec) + timeout_usec = UNIT_VTABLE(u)->get_timeout_start_usec(u); + + match = strjoina("type='signal'," + "sender='org.freedesktop.DBus'," + "path='/org/freedesktop/DBus'," + "interface='org.freedesktop.DBus'," + "member='NameOwnerChanged'," + "arg0='", name, "'"); + + r = bus_add_match_full( + bus, + &u->match_bus_slot, + true, + match, + signal_name_owner_changed, + NULL, + u, + timeout_usec); + if (r < 0) + return r; + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetNameOwner"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "s", name); + if (r < 0) + return r; + + r = sd_bus_call_async( + bus, + &u->get_name_owner_slot, + m, + get_name_owner_handler, + u, + timeout_usec); + + if (r < 0) { + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + return r; + } + + log_unit_debug(u, "Watching D-Bus name '%s'.", name); + return 0; +} + +int unit_watch_bus_name(Unit *u, const char *name) { + int r; + + assert(u); + assert(name); + + /* Watch a specific name on the bus. We only support one unit + * watching each name for now. */ + + if (u->manager->api_bus) { + /* If the bus is already available, install the match directly. + * Otherwise, just put the name in the list. bus_setup_api() will take care later. */ + r = unit_install_bus_match(u, u->manager->api_bus, name); + if (r < 0) + return log_warning_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name); + } + + r = hashmap_put(u->manager->watch_bus, name, u); + if (r < 0) { + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot); + return log_warning_errno(r, "Failed to put bus name to hashmap: %m"); + } + + return 0; +} + +void unit_unwatch_bus_name(Unit *u, const char *name) { + assert(u); + assert(name); + + (void) hashmap_remove_value(u->manager->watch_bus, name, u); + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot); +} + +int unit_add_node_dependency(Unit *u, const char *what, UnitDependency dep, UnitDependencyMask mask) { + _cleanup_free_ char *e = NULL; + Unit *device; + int r; + + assert(u); + + /* Adds in links to the device node that this unit is based on */ + if (isempty(what)) + return 0; + + if (!is_device_path(what)) + return 0; + + /* When device units aren't supported (such as in a container), don't create dependencies on them. */ + if (!unit_type_supported(UNIT_DEVICE)) + return 0; + + r = unit_name_from_path(what, ".device", &e); + if (r < 0) + return r; + + r = manager_load_unit(u->manager, e, NULL, NULL, &device); + if (r < 0) + return r; + + if (dep == UNIT_REQUIRES && device_shall_be_bound_by(device, u)) + dep = UNIT_BINDS_TO; + + return unit_add_two_dependencies(u, UNIT_AFTER, + MANAGER_IS_SYSTEM(u->manager) ? dep : UNIT_WANTS, + device, true, mask); +} + +int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask) { + _cleanup_free_ char *escaped = NULL, *target = NULL; + int r; + + assert(u); + + if (isempty(what)) + return 0; + + if (!path_startswith(what, "/dev/")) + return 0; + + /* If we don't support devices, then also don't bother with blockdev@.target */ + if (!unit_type_supported(UNIT_DEVICE)) + return 0; + + r = unit_name_path_escape(what, &escaped); + if (r < 0) + return r; + + r = unit_name_build("blockdev", escaped, ".target", &target); + if (r < 0) + return r; + + return unit_add_dependency_by_name(u, UNIT_AFTER, target, true, mask); +} + +int unit_coldplug(Unit *u) { + int r = 0; + + assert(u); + + /* Make sure we don't enter a loop, when coldplugging recursively. */ + if (u->coldplugged) + return 0; + + u->coldplugged = true; + + STRV_FOREACH(i, u->deserialized_refs) + RET_GATHER(r, bus_unit_track_add_name(u, *i)); + + u->deserialized_refs = strv_free(u->deserialized_refs); + + if (UNIT_VTABLE(u)->coldplug) + RET_GATHER(r, UNIT_VTABLE(u)->coldplug(u)); + + if (u->job) + RET_GATHER(r, job_coldplug(u->job)); + if (u->nop_job) + RET_GATHER(r, job_coldplug(u->nop_job)); + + unit_modify_nft_set(u, /* add = */ true); + return r; +} + +void unit_catchup(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->catchup) + UNIT_VTABLE(u)->catchup(u); + + unit_cgroup_catchup(u); +} + +static bool fragment_mtime_newer(const char *path, usec_t mtime, bool path_masked) { + struct stat st; + + if (!path) + return false; + + /* If the source is some virtual kernel file system, then we assume we watch it anyway, and hence pretend we + * are never out-of-date. */ + if (PATH_STARTSWITH_SET(path, "/proc", "/sys")) + return false; + + if (stat(path, &st) < 0) + /* What, cannot access this anymore? */ + return true; + + if (path_masked) + /* For masked files check if they are still so */ + return !null_or_empty(&st); + else + /* For non-empty files check the mtime */ + return timespec_load(&st.st_mtim) > mtime; + + return false; +} + +bool unit_need_daemon_reload(Unit *u) { + _cleanup_strv_free_ char **dropins = NULL; + + assert(u); + assert(u->manager); + + if (u->manager->unit_file_state_outdated) + return true; + + /* For unit files, we allow masking… */ + if (fragment_mtime_newer(u->fragment_path, u->fragment_mtime, + u->load_state == UNIT_MASKED)) + return true; + + /* Source paths should not be masked… */ + if (fragment_mtime_newer(u->source_path, u->source_mtime, false)) + return true; + + if (u->load_state == UNIT_LOADED) + (void) unit_find_dropin_paths(u, &dropins); + if (!strv_equal(u->dropin_paths, dropins)) + return true; + + /* … any drop-ins that are masked are simply omitted from the list. */ + STRV_FOREACH(path, u->dropin_paths) + if (fragment_mtime_newer(*path, u->dropin_mtime, false)) + return true; + + return false; +} + +void unit_reset_failed(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->reset_failed) + UNIT_VTABLE(u)->reset_failed(u); + + ratelimit_reset(&u->start_ratelimit); + u->start_limit_hit = false; +} + +Unit *unit_following(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->following) + return UNIT_VTABLE(u)->following(u); + + return NULL; +} + +bool unit_stop_pending(Unit *u) { + assert(u); + + /* This call does check the current state of the unit. It's + * hence useful to be called from state change calls of the + * unit itself, where the state isn't updated yet. This is + * different from unit_inactive_or_pending() which checks both + * the current state and for a queued job. */ + + return unit_has_job_type(u, JOB_STOP); +} + +bool unit_inactive_or_pending(Unit *u) { + assert(u); + + /* Returns true if the unit is inactive or going down */ + + if (UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u))) + return true; + + if (unit_stop_pending(u)) + return true; + + return false; +} + +bool unit_active_or_pending(Unit *u) { + assert(u); + + /* Returns true if the unit is active or going up */ + + if (UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u))) + return true; + + if (u->job && + IN_SET(u->job->type, JOB_START, JOB_RELOAD_OR_START, JOB_RESTART)) + return true; + + return false; +} + +bool unit_will_restart_default(Unit *u) { + assert(u); + + return unit_has_job_type(u, JOB_START); +} + +bool unit_will_restart(Unit *u) { + assert(u); + + if (!UNIT_VTABLE(u)->will_restart) + return false; + + return UNIT_VTABLE(u)->will_restart(u); +} + +void unit_notify_cgroup_oom(Unit *u, bool managed_oom) { + assert(u); + + if (UNIT_VTABLE(u)->notify_cgroup_oom) + UNIT_VTABLE(u)->notify_cgroup_oom(u, managed_oom); +} + +static Set *unit_pid_set(pid_t main_pid, pid_t control_pid) { + _cleanup_set_free_ Set *pid_set = NULL; + int r; + + pid_set = set_new(NULL); + if (!pid_set) + return NULL; + + /* Exclude the main/control pids from being killed via the cgroup */ + if (main_pid > 0) { + r = set_put(pid_set, PID_TO_PTR(main_pid)); + if (r < 0) + return NULL; + } + + if (control_pid > 0) { + r = set_put(pid_set, PID_TO_PTR(control_pid)); + if (r < 0) + return NULL; + } + + return TAKE_PTR(pid_set); +} + +static int kill_common_log(const PidRef *pid, int signo, void *userdata) { + _cleanup_free_ char *comm = NULL; + Unit *u = ASSERT_PTR(userdata); + + (void) pidref_get_comm(pid, &comm); + + log_unit_info(u, "Sending signal SIG%s to process " PID_FMT " (%s) on client request.", + signal_to_string(signo), pid->pid, strna(comm)); + + return 1; +} + +static int kill_or_sigqueue(PidRef* pidref, int signo, int code, int value) { + assert(pidref_is_set(pidref)); + assert(SIGNAL_VALID(signo)); + + switch (code) { + + case SI_USER: + log_debug("Killing " PID_FMT " with signal SIG%s.", pidref->pid, signal_to_string(signo)); + return pidref_kill(pidref, signo); + + case SI_QUEUE: + log_debug("Enqueuing value %i to " PID_FMT " on signal SIG%s.", value, pidref->pid, signal_to_string(signo)); + return pidref_sigqueue(pidref, signo, value); + + default: + assert_not_reached(); + } +} + +int unit_kill( + Unit *u, + KillWho who, + int signo, + int code, + int value, + sd_bus_error *error) { + + PidRef *main_pid, *control_pid; + bool killed = false; + int ret = 0, r; + + /* This is the common implementation for explicit user-requested killing of unit processes, shared by + * various unit types. Do not confuse with unit_kill_context(), which is what we use when we want to + * stop a service ourselves. */ + + assert(u); + assert(who >= 0); + assert(who < _KILL_WHO_MAX); + assert(SIGNAL_VALID(signo)); + assert(IN_SET(code, SI_USER, SI_QUEUE)); + + main_pid = unit_main_pid(u); + control_pid = unit_control_pid(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u) && !main_pid && !control_pid) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit type does not support process killing."); + + if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL)) { + if (!main_pid) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type)); + if (!pidref_is_set(main_pid)) + return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill"); + } + + if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL)) { + if (!control_pid) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type)); + if (!pidref_is_set(control_pid)) + return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill"); + } + + if (pidref_is_set(control_pid) && + IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL)) { + _cleanup_free_ char *comm = NULL; + (void) pidref_get_comm(control_pid, &comm); + + r = kill_or_sigqueue(control_pid, signo, code, value); + if (r < 0) { + ret = r; + + /* Report this failure both to the logs and to the client */ + sd_bus_error_set_errnof( + error, r, + "Failed to send signal SIG%s to control process " PID_FMT " (%s): %m", + signal_to_string(signo), control_pid->pid, strna(comm)); + log_unit_warning_errno( + u, r, + "Failed to send signal SIG%s to control process " PID_FMT " (%s) on client request: %m", + signal_to_string(signo), control_pid->pid, strna(comm)); + } else { + log_unit_info(u, "Sent signal SIG%s to control process " PID_FMT " (%s) on client request.", + signal_to_string(signo), control_pid->pid, strna(comm)); + killed = true; + } + } + + if (pidref_is_set(main_pid) && + IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL)) { + _cleanup_free_ char *comm = NULL; + (void) pidref_get_comm(main_pid, &comm); + + r = kill_or_sigqueue(main_pid, signo, code, value); + if (r < 0) { + if (ret == 0) { + ret = r; + + sd_bus_error_set_errnof( + error, r, + "Failed to send signal SIG%s to main process " PID_FMT " (%s): %m", + signal_to_string(signo), main_pid->pid, strna(comm)); + } + + log_unit_warning_errno( + u, r, + "Failed to send signal SIG%s to main process " PID_FMT " (%s) on client request: %m", + signal_to_string(signo), main_pid->pid, strna(comm)); + + } else { + log_unit_info(u, "Sent signal SIG%s to main process " PID_FMT " (%s) on client request.", + signal_to_string(signo), main_pid->pid, strna(comm)); + killed = true; + } + } + + /* Note: if we shall enqueue rather than kill we won't do this via the cgroup mechanism, since it + * doesn't really make much sense (and given that enqueued values are a relatively expensive + * resource, and we shouldn't allow us to be subjects for such allocation sprees) */ + if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && u->cgroup_path && code == SI_USER) { + _cleanup_set_free_ Set *pid_set = NULL; + + /* Exclude the main/control pids from being killed via the cgroup */ + pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0); + if (!pid_set) + return log_oom(); + + r = cg_kill_recursive(u->cgroup_path, signo, 0, pid_set, kill_common_log, u); + if (r < 0) { + if (!IN_SET(r, -ESRCH, -ENOENT)) { + if (ret == 0) { + ret = r; + + sd_bus_error_set_errnof( + error, r, + "Failed to send signal SIG%s to auxiliary processes: %m", + signal_to_string(signo)); + } + + log_unit_warning_errno( + u, r, + "Failed to send signal SIG%s to auxiliary processes on client request: %m", + signal_to_string(signo)); + } + } else + killed = true; + } + + /* If the "fail" versions of the operation are requested, then complain if the set of processes we killed is empty */ + if (ret == 0 && !killed && IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL, KILL_MAIN_FAIL)) + return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No matching processes to kill"); + + return ret; +} + +int unit_following_set(Unit *u, Set **s) { + assert(u); + assert(s); + + if (UNIT_VTABLE(u)->following_set) + return UNIT_VTABLE(u)->following_set(u, s); + + *s = NULL; + return 0; +} + +UnitFileState unit_get_unit_file_state(Unit *u) { + int r; + + assert(u); + + if (u->unit_file_state < 0 && u->fragment_path) { + r = unit_file_get_state( + u->manager->runtime_scope, + NULL, + u->id, + &u->unit_file_state); + if (r < 0) + u->unit_file_state = UNIT_FILE_BAD; + } + + return u->unit_file_state; +} + +PresetAction unit_get_unit_file_preset(Unit *u) { + int r; + + assert(u); + + if (u->unit_file_preset < 0 && u->fragment_path) { + _cleanup_free_ char *bn = NULL; + + r = path_extract_filename(u->fragment_path, &bn); + if (r < 0) + return (u->unit_file_preset = r); + + if (r == O_DIRECTORY) + return (u->unit_file_preset = -EISDIR); + + u->unit_file_preset = unit_file_query_preset( + u->manager->runtime_scope, + NULL, + bn, + NULL); + } + + return u->unit_file_preset; +} + +Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target) { + assert(ref); + assert(source); + assert(target); + + if (ref->target) + unit_ref_unset(ref); + + ref->source = source; + ref->target = target; + LIST_PREPEND(refs_by_target, target->refs_by_target, ref); + return target; +} + +void unit_ref_unset(UnitRef *ref) { + assert(ref); + + if (!ref->target) + return; + + /* We are about to drop a reference to the unit, make sure the garbage collection has a look at it as it might + * be unreferenced now. */ + unit_add_to_gc_queue(ref->target); + + LIST_REMOVE(refs_by_target, ref->target->refs_by_target, ref); + ref->source = ref->target = NULL; +} + +static int user_from_unit_name(Unit *u, char **ret) { + + static const uint8_t hash_key[] = { + 0x58, 0x1a, 0xaf, 0xe6, 0x28, 0x58, 0x4e, 0x96, + 0xb4, 0x4e, 0xf5, 0x3b, 0x8c, 0x92, 0x07, 0xec + }; + + _cleanup_free_ char *n = NULL; + int r; + + r = unit_name_to_prefix(u->id, &n); + if (r < 0) + return r; + + if (valid_user_group_name(n, 0)) { + *ret = TAKE_PTR(n); + return 0; + } + + /* If we can't use the unit name as a user name, then let's hash it and use that */ + if (asprintf(ret, "_du%016" PRIx64, siphash24(n, strlen(n), hash_key)) < 0) + return -ENOMEM; + + return 0; +} + +int unit_patch_contexts(Unit *u) { + CGroupContext *cc; + ExecContext *ec; + int r; + + assert(u); + + /* Patch in the manager defaults into the exec and cgroup + * contexts, _after_ the rest of the settings have been + * initialized */ + + ec = unit_get_exec_context(u); + if (ec) { + /* This only copies in the ones that need memory */ + for (unsigned i = 0; i < _RLIMIT_MAX; i++) + if (u->manager->defaults.rlimit[i] && !ec->rlimit[i]) { + ec->rlimit[i] = newdup(struct rlimit, u->manager->defaults.rlimit[i], 1); + if (!ec->rlimit[i]) + return -ENOMEM; + } + + if (MANAGER_IS_USER(u->manager) && + !ec->working_directory) { + + r = get_home_dir(&ec->working_directory); + if (r < 0) + return r; + + /* Allow user services to run, even if the + * home directory is missing */ + ec->working_directory_missing_ok = true; + } + + if (ec->private_devices) + ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO)); + + if (ec->protect_kernel_modules) + ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); + + if (ec->protect_kernel_logs) + ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYSLOG); + + if (ec->protect_clock) + ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_SYS_TIME) | (UINT64_C(1) << CAP_WAKE_ALARM)); + + if (ec->dynamic_user) { + if (!ec->user) { + r = user_from_unit_name(u, &ec->user); + if (r < 0) + return r; + } + + if (!ec->group) { + ec->group = strdup(ec->user); + if (!ec->group) + return -ENOMEM; + } + + /* If the dynamic user option is on, let's make sure that the unit can't leave its + * UID/GID around in the file system or on IPC objects. Hence enforce a strict + * sandbox. */ + + ec->private_tmp = true; + ec->remove_ipc = true; + ec->protect_system = PROTECT_SYSTEM_STRICT; + if (ec->protect_home == PROTECT_HOME_NO) + ec->protect_home = PROTECT_HOME_READ_ONLY; + + /* Make sure this service can neither benefit from SUID/SGID binaries nor create + * them. */ + ec->no_new_privileges = true; + ec->restrict_suid_sgid = true; + } + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) + exec_directory_sort(ec->directories + dt); + } + + cc = unit_get_cgroup_context(u); + if (cc && ec) { + + if (ec->private_devices && + cc->device_policy == CGROUP_DEVICE_POLICY_AUTO) + cc->device_policy = CGROUP_DEVICE_POLICY_CLOSED; + + /* Only add these if needed, as they imply that everything else is blocked. */ + if (cc->device_policy != CGROUP_DEVICE_POLICY_AUTO || cc->device_allow) { + if (ec->root_image || ec->mount_images) { + + /* When RootImage= or MountImages= is specified, the following devices are touched. */ + FOREACH_STRING(p, "/dev/loop-control", "/dev/mapper/control") { + r = cgroup_context_add_device_allow(cc, p, CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE); + if (r < 0) + return r; + } + FOREACH_STRING(p, "block-loop", "block-blkext", "block-device-mapper") { + r = cgroup_context_add_device_allow(cc, p, CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD); + if (r < 0) + return r; + } + + /* Make sure "block-loop" can be resolved, i.e. make sure "loop" shows up in /proc/devices. + * Same for mapper and verity. */ + FOREACH_STRING(p, "modprobe@loop.service", "modprobe@dm_mod.service", "modprobe@dm_verity.service") { + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, p, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + } + + if (ec->protect_clock) { + r = cgroup_context_add_device_allow(cc, "char-rtc", CGROUP_DEVICE_READ); + if (r < 0) + return r; + } + + /* If there are encrypted credentials we might need to access the TPM. */ + if (exec_context_has_encrypted_credentials(ec)) { + r = cgroup_context_add_device_allow(cc, "char-tpm", CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE); + if (r < 0) + return r; + } + } + } + + return 0; +} + +ExecContext *unit_get_exec_context(const Unit *u) { + size_t offset; + assert(u); + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->exec_context_offset; + if (offset <= 0) + return NULL; + + return (ExecContext*) ((uint8_t*) u + offset); +} + +KillContext *unit_get_kill_context(Unit *u) { + size_t offset; + assert(u); + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->kill_context_offset; + if (offset <= 0) + return NULL; + + return (KillContext*) ((uint8_t*) u + offset); +} + +CGroupContext *unit_get_cgroup_context(Unit *u) { + size_t offset; + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->cgroup_context_offset; + if (offset <= 0) + return NULL; + + return (CGroupContext*) ((uint8_t*) u + offset); +} + +ExecRuntime *unit_get_exec_runtime(Unit *u) { + size_t offset; + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->exec_runtime_offset; + if (offset <= 0) + return NULL; + + return *(ExecRuntime**) ((uint8_t*) u + offset); +} + +static const char* unit_drop_in_dir(Unit *u, UnitWriteFlags flags) { + assert(u); + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + return NULL; + + if (u->transient) /* Redirect drop-ins for transient units always into the transient directory. */ + return u->manager->lookup_paths.transient; + + if (flags & UNIT_PERSISTENT) + return u->manager->lookup_paths.persistent_control; + + if (flags & UNIT_RUNTIME) + return u->manager->lookup_paths.runtime_control; + + return NULL; +} + +const char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf) { + assert(s); + assert(popcount(flags & (UNIT_ESCAPE_EXEC_SYNTAX_ENV | UNIT_ESCAPE_EXEC_SYNTAX | UNIT_ESCAPE_C)) <= 1); + assert(buf); + + _cleanup_free_ char *t = NULL; + + /* Returns a string with any escaping done. If no escaping was necessary, *buf is set to NULL, and + * the input pointer is returned as-is. If an allocation was needed, the return buffer pointer is + * written to *buf. This means the return value always contains a properly escaped version, but *buf + * only contains a pointer if an allocation was made. Callers can use this to optimize memory + * allocations. */ + + if (flags & UNIT_ESCAPE_SPECIFIERS) { + t = specifier_escape(s); + if (!t) + return NULL; + + s = t; + } + + /* We either do C-escaping or shell-escaping, to additionally escape characters that we parse for + * ExecStart= and friends, i.e. '$' and quotes. */ + + if (flags & (UNIT_ESCAPE_EXEC_SYNTAX_ENV | UNIT_ESCAPE_EXEC_SYNTAX)) { + char *t2; + + if (flags & UNIT_ESCAPE_EXEC_SYNTAX_ENV) { + t2 = strreplace(s, "$", "$$"); + if (!t2) + return NULL; + free_and_replace(t, t2); + } + + t2 = shell_escape(t ?: s, "\""); + if (!t2) + return NULL; + free_and_replace(t, t2); + + s = t; + + } else if (flags & UNIT_ESCAPE_C) { + char *t2; + + t2 = cescape(s); + if (!t2) + return NULL; + free_and_replace(t, t2); + + s = t; + } + + *buf = TAKE_PTR(t); + return s; +} + +char* unit_concat_strv(char **l, UnitWriteFlags flags) { + _cleanup_free_ char *result = NULL; + size_t n = 0; + + /* Takes a list of strings, escapes them, and concatenates them. This may be used to format command + * lines in a way suitable for ExecStart= stanzas. */ + + STRV_FOREACH(i, l) { + _cleanup_free_ char *buf = NULL; + const char *p; + size_t a; + char *q; + + p = unit_escape_setting(*i, flags, &buf); + if (!p) + return NULL; + + a = (n > 0) + 1 + strlen(p) + 1; /* separating space + " + entry + " */ + if (!GREEDY_REALLOC(result, n + a + 1)) + return NULL; + + q = result + n; + if (n > 0) + *(q++) = ' '; + + *(q++) = '"'; + q = stpcpy(q, p); + *(q++) = '"'; + + n += a; + } + + if (!GREEDY_REALLOC(result, n + 1)) + return NULL; + + result[n] = 0; + + return TAKE_PTR(result); +} + +int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data) { + _cleanup_free_ char *p = NULL, *q = NULL, *escaped = NULL; + const char *dir, *wrapped; + int r; + + assert(u); + assert(name); + assert(data); + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + return 0; + + data = unit_escape_setting(data, flags, &escaped); + if (!data) + return -ENOMEM; + + /* Prefix the section header. If we are writing this out as transient file, then let's suppress this if the + * previous section header is the same */ + + if (flags & UNIT_PRIVATE) { + if (!UNIT_VTABLE(u)->private_section) + return -EINVAL; + + if (!u->transient_file || u->last_section_private < 0) + data = strjoina("[", UNIT_VTABLE(u)->private_section, "]\n", data); + else if (u->last_section_private == 0) + data = strjoina("\n[", UNIT_VTABLE(u)->private_section, "]\n", data); + } else { + if (!u->transient_file || u->last_section_private < 0) + data = strjoina("[Unit]\n", data); + else if (u->last_section_private > 0) + data = strjoina("\n[Unit]\n", data); + } + + if (u->transient_file) { + /* When this is a transient unit file in creation, then let's not create a new drop-in but instead + * write to the transient unit file. */ + fputs(data, u->transient_file); + + if (!endswith(data, "\n")) + fputc('\n', u->transient_file); + + /* Remember which section we wrote this entry to */ + u->last_section_private = !!(flags & UNIT_PRIVATE); + return 0; + } + + dir = unit_drop_in_dir(u, flags); + if (!dir) + return -EINVAL; + + wrapped = strjoina("# This is a drop-in unit file extension, created via \"systemctl set-property\"\n" + "# or an equivalent operation. Do not edit.\n", + data, + "\n"); + + r = drop_in_file(dir, u->id, 50, name, &p, &q); + if (r < 0) + return r; + + (void) mkdir_p_label(p, 0755); + + /* Make sure the drop-in dir is registered in our path cache. This way we don't need to stupidly + * recreate the cache after every drop-in we write. */ + if (u->manager->unit_path_cache) { + r = set_put_strdup(&u->manager->unit_path_cache, p); + if (r < 0) + return r; + } + + r = write_string_file_atomic_label(q, wrapped); + if (r < 0) + return r; + + r = strv_push(&u->dropin_paths, q); + if (r < 0) + return r; + q = NULL; + + strv_uniq(u->dropin_paths); + + u->dropin_mtime = now(CLOCK_REALTIME); + + return 0; +} + +int unit_write_settingf(Unit *u, UnitWriteFlags flags, const char *name, const char *format, ...) { + _cleanup_free_ char *p = NULL; + va_list ap; + int r; + + assert(u); + assert(name); + assert(format); + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + return 0; + + va_start(ap, format); + r = vasprintf(&p, format, ap); + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return unit_write_setting(u, flags, name, p); +} + +int unit_make_transient(Unit *u) { + _cleanup_free_ char *path = NULL; + FILE *f; + + assert(u); + + if (!UNIT_VTABLE(u)->can_transient) + return -EOPNOTSUPP; + + (void) mkdir_p_label(u->manager->lookup_paths.transient, 0755); + + path = path_join(u->manager->lookup_paths.transient, u->id); + if (!path) + return -ENOMEM; + + /* Let's open the file we'll write the transient settings into. This file is kept open as long as we are + * creating the transient, and is closed in unit_load(), as soon as we start loading the file. */ + + WITH_UMASK(0022) { + f = fopen(path, "we"); + if (!f) + return -errno; + } + + safe_fclose(u->transient_file); + u->transient_file = f; + + free_and_replace(u->fragment_path, path); + + u->source_path = mfree(u->source_path); + u->dropin_paths = strv_free(u->dropin_paths); + u->fragment_mtime = u->source_mtime = u->dropin_mtime = 0; + + u->load_state = UNIT_STUB; + u->load_error = 0; + u->transient = true; + + unit_add_to_dbus_queue(u); + unit_add_to_gc_queue(u); + + fputs("# This is a transient unit file, created programmatically via the systemd API. Do not edit.\n", + u->transient_file); + + return 0; +} + +static int log_kill(const PidRef *pid, int sig, void *userdata) { + _cleanup_free_ char *comm = NULL; + + assert(pidref_is_set(pid)); + + (void) pidref_get_comm(pid, &comm); + + /* Don't log about processes marked with brackets, under the assumption that these are temporary processes + only, like for example systemd's own PAM stub process. */ + if (comm && comm[0] == '(') + /* Although we didn't log anything, as this callback is used in unit_kill_context we must return 1 + * here to let the manager know that a process was killed. */ + return 1; + + log_unit_notice(userdata, + "Killing process " PID_FMT " (%s) with signal SIG%s.", + pid->pid, + strna(comm), + signal_to_string(sig)); + + return 1; +} + +static int operation_to_signal( + const KillContext *c, + KillOperation k, + bool *ret_noteworthy) { + + assert(c); + + switch (k) { + + case KILL_TERMINATE: + case KILL_TERMINATE_AND_LOG: + *ret_noteworthy = false; + return c->kill_signal; + + case KILL_RESTART: + *ret_noteworthy = false; + return restart_kill_signal(c); + + case KILL_KILL: + *ret_noteworthy = true; + return c->final_kill_signal; + + case KILL_WATCHDOG: + *ret_noteworthy = true; + return c->watchdog_signal; + + default: + assert_not_reached(); + } +} + +int unit_kill_context( + Unit *u, + KillContext *c, + KillOperation k, + PidRef* main_pid, + PidRef* control_pid, + bool main_pid_alien) { + + bool wait_for_exit = false, send_sighup; + cg_kill_log_func_t log_func = NULL; + int sig, r; + + assert(u); + assert(c); + + /* Kill the processes belonging to this unit, in preparation for shutting the unit down. Returns > 0 + * if we killed something worth waiting for, 0 otherwise. Do not confuse with unit_kill_common() + * which is used for user-requested killing of unit processes. */ + + if (c->kill_mode == KILL_NONE) + return 0; + + bool noteworthy; + sig = operation_to_signal(c, k, ¬eworthy); + if (noteworthy) + log_func = log_kill; + + send_sighup = + c->send_sighup && + IN_SET(k, KILL_TERMINATE, KILL_TERMINATE_AND_LOG) && + sig != SIGHUP; + + if (pidref_is_set(main_pid)) { + if (log_func) + log_func(main_pid, sig, u); + + r = pidref_kill_and_sigcont(main_pid, sig); + if (r < 0 && r != -ESRCH) { + _cleanup_free_ char *comm = NULL; + (void) pidref_get_comm(main_pid, &comm); + + log_unit_warning_errno(u, r, "Failed to kill main process " PID_FMT " (%s), ignoring: %m", main_pid->pid, strna(comm)); + } else { + if (!main_pid_alien) + wait_for_exit = true; + + if (r != -ESRCH && send_sighup) + (void) pidref_kill(main_pid, SIGHUP); + } + } + + if (pidref_is_set(control_pid)) { + if (log_func) + log_func(control_pid, sig, u); + + r = pidref_kill_and_sigcont(control_pid, sig); + if (r < 0 && r != -ESRCH) { + _cleanup_free_ char *comm = NULL; + (void) pidref_get_comm(control_pid, &comm); + + log_unit_warning_errno(u, r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", control_pid->pid, strna(comm)); + } else { + wait_for_exit = true; + + if (r != -ESRCH && send_sighup) + (void) pidref_kill(control_pid, SIGHUP); + } + } + + if (u->cgroup_path && + (c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL))) { + _cleanup_set_free_ Set *pid_set = NULL; + + /* Exclude the main/control pids from being killed via the cgroup */ + pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0); + if (!pid_set) + return -ENOMEM; + + r = cg_kill_recursive( + u->cgroup_path, + sig, + CGROUP_SIGCONT|CGROUP_IGNORE_SELF, + pid_set, + log_func, u); + if (r < 0) { + if (!IN_SET(r, -EAGAIN, -ESRCH, -ENOENT)) + log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", empty_to_root(u->cgroup_path)); + + } else if (r > 0) { + + /* FIXME: For now, on the legacy hierarchy, we will not wait for the cgroup members to die if + * we are running in a container or if this is a delegation unit, simply because cgroup + * notification is unreliable in these cases. It doesn't work at all in containers, and outside + * of containers it can be confused easily by left-over directories in the cgroup — which + * however should not exist in non-delegated units. On the unified hierarchy that's different, + * there we get proper events. Hence rely on them. */ + + if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0 || + (detect_container() == 0 && !unit_cgroup_delegate(u))) + wait_for_exit = true; + + if (send_sighup) { + set_free(pid_set); + + pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0); + if (!pid_set) + return -ENOMEM; + + (void) cg_kill_recursive( + u->cgroup_path, + SIGHUP, + CGROUP_IGNORE_SELF, + pid_set, + /* kill_log= */ NULL, + /* userdata= */ NULL); + } + } + } + + return wait_for_exit; +} + +int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) { + int r; + + assert(u); + assert(path); + + /* Registers a unit for requiring a certain path and all its prefixes. We keep a hashtable of these + * paths in the unit (from the path to the UnitDependencyInfo structure indicating how to the + * dependency came to be). However, we build a prefix table for all possible prefixes so that new + * appearing mount units can easily determine which units to make themselves a dependency of. */ + + if (!path_is_absolute(path)) + return -EINVAL; + + if (hashmap_contains(u->requires_mounts_for, path)) /* Exit quickly if the path is already covered. */ + return 0; + + /* Use the canonical form of the path as the stored key. We call path_is_normalized() + * only after simplification, since path_is_normalized() rejects paths with '.'. + * path_is_normalized() also verifies that the path fits in PATH_MAX. */ + _cleanup_free_ char *p = NULL; + r = path_simplify_alloc(path, &p); + if (r < 0) + return r; + path = p; + + if (!path_is_normalized(path)) + return -EPERM; + + UnitDependencyInfo di = { + .origin_mask = mask + }; + + r = hashmap_ensure_put(&u->requires_mounts_for, &path_hash_ops, p, di.data); + if (r < 0) + return r; + assert(r > 0); + TAKE_PTR(p); /* path remains a valid pointer to the string stored in the hashmap */ + + char prefix[strlen(path) + 1]; + PATH_FOREACH_PREFIX_MORE(prefix, path) { + Set *x; + + x = hashmap_get(u->manager->units_requiring_mounts_for, prefix); + if (!x) { + _cleanup_free_ char *q = NULL; + + r = hashmap_ensure_allocated(&u->manager->units_requiring_mounts_for, &path_hash_ops); + if (r < 0) + return r; + + q = strdup(prefix); + if (!q) + return -ENOMEM; + + x = set_new(NULL); + if (!x) + return -ENOMEM; + + r = hashmap_put(u->manager->units_requiring_mounts_for, q, x); + if (r < 0) { + set_free(x); + return r; + } + q = NULL; + } + + r = set_put(x, u); + if (r < 0) + return r; + } + + return 0; +} + +int unit_setup_exec_runtime(Unit *u) { + _cleanup_(exec_shared_runtime_unrefp) ExecSharedRuntime *esr = NULL; + _cleanup_(dynamic_creds_unrefp) DynamicCreds *dcreds = NULL; + _cleanup_set_free_ Set *units = NULL; + ExecRuntime **rt; + ExecContext *ec; + size_t offset; + Unit *other; + int r; + + offset = UNIT_VTABLE(u)->exec_runtime_offset; + assert(offset > 0); + + /* Check if there already is an ExecRuntime for this unit? */ + rt = (ExecRuntime**) ((uint8_t*) u + offset); + if (*rt) + return 0; + + ec = unit_get_exec_context(u); + assert(ec); + + r = unit_get_transitive_dependency_set(u, UNIT_ATOM_JOINS_NAMESPACE_OF, &units); + if (r < 0) + return r; + + /* Try to get it from somebody else */ + SET_FOREACH(other, units) { + r = exec_shared_runtime_acquire(u->manager, NULL, other->id, false, &esr); + if (r < 0) + return r; + if (r > 0) + break; + } + + if (!esr) { + r = exec_shared_runtime_acquire(u->manager, ec, u->id, true, &esr); + if (r < 0) + return r; + } + + if (ec->dynamic_user) { + r = dynamic_creds_make(u->manager, ec->user, ec->group, &dcreds); + if (r < 0) + return r; + } + + r = exec_runtime_make(u, ec, esr, dcreds, rt); + if (r < 0) + return r; + + TAKE_PTR(esr); + TAKE_PTR(dcreds); + + return r; +} + +bool unit_type_supported(UnitType t) { + static int8_t cache[_UNIT_TYPE_MAX] = {}; /* -1: disabled, 1: enabled: 0: don't know */ + int r; + + assert(t >= 0 && t < _UNIT_TYPE_MAX); + + if (cache[t] == 0) { + char *e; + + e = strjoina("SYSTEMD_SUPPORT_", unit_type_to_string(t)); + + r = getenv_bool(ascii_strupper(e)); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $%s, ignoring: %m", e); + + cache[t] = r == 0 ? -1 : 1; + } + if (cache[t] < 0) + return false; + + if (!unit_vtable[t]->supported) + return true; + + return unit_vtable[t]->supported(); +} + +void unit_warn_if_dir_nonempty(Unit *u, const char* where) { + int r; + + assert(u); + assert(where); + + if (!unit_log_level_test(u, LOG_NOTICE)) + return; + + r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false); + if (r > 0 || r == -ENOTDIR) + return; + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to check directory %s: %m", where); + return; + } + + log_unit_struct(u, LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR, + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Directory %s to mount over is not empty, mounting anyway.", where), + "WHERE=%s", where); +} + +int unit_fail_if_noncanonical(Unit *u, const char* where) { + _cleanup_free_ char *canonical_where = NULL; + int r; + + assert(u); + assert(where); + + r = chase(where, NULL, CHASE_NONEXISTENT, &canonical_where, NULL); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to check %s for symlinks, ignoring: %m", where); + return 0; + } + + /* We will happily ignore a trailing slash (or any redundant slashes) */ + if (path_equal(where, canonical_where)) + return 0; + + /* No need to mention "." or "..", they would already have been rejected by unit_name_from_path() */ + log_unit_struct(u, LOG_ERR, + "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR, + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Mount path %s is not canonical (contains a symlink).", where), + "WHERE=%s", where); + + return -ELOOP; +} + +bool unit_is_pristine(Unit *u) { + assert(u); + + /* Check if the unit already exists or is already around, in a number of different ways. Note that to + * cater for unit types such as slice, we are generally fine with units that are marked UNIT_LOADED + * even though nothing was actually loaded, as those unit types don't require a file on disk. + * + * Note that we don't check for drop-ins here, because we allow drop-ins for transient units + * identically to non-transient units, both unit-specific and hierarchical. E.g. for a-b-c.service: + * service.d/….conf, a-.service.d/….conf, a-b-.service.d/….conf, a-b-c.service.d/….conf. + */ + + return IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_LOADED) && + !u->fragment_path && + !u->source_path && + !u->job && + !u->merged_into; +} + +PidRef* unit_control_pid(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->control_pid) + return UNIT_VTABLE(u)->control_pid(u); + + return NULL; +} + +PidRef* unit_main_pid(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->main_pid) + return UNIT_VTABLE(u)->main_pid(u); + + return NULL; +} + +static void unit_modify_user_nft_set(Unit *u, bool add, NFTSetSource source, uint32_t element) { + int r; + + assert(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + CGroupContext *c; + c = unit_get_cgroup_context(u); + if (!c) + return; + + if (!u->manager->fw_ctx) { + r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false); + if (r < 0) + return; + + assert(u->manager->fw_ctx); + } + + FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) { + if (nft_set->source != source) + continue; + + r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element)); + if (r < 0) + log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, ID %u, ignoring: %m", + add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, element); + else + log_debug("%s NFT set: family %s, table %s, set %s, ID %u", + add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, element); + } +} + +static void unit_unref_uid_internal( + Unit *u, + uid_t *ref_uid, + bool destroy_now, + void (*_manager_unref_uid)(Manager *m, uid_t uid, bool destroy_now)) { + + assert(u); + assert(ref_uid); + assert(_manager_unref_uid); + + /* Generic implementation of both unit_unref_uid() and unit_unref_gid(), under the assumption that uid_t and + * gid_t are actually the same time, with the same validity rules. + * + * Drops a reference to UID/GID from a unit. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (!uid_is_valid(*ref_uid)) + return; + + _manager_unref_uid(u->manager, *ref_uid, destroy_now); + *ref_uid = UID_INVALID; +} + +static void unit_unref_uid(Unit *u, bool destroy_now) { + assert(u); + + unit_modify_user_nft_set(u, /* add = */ false, NFT_SET_SOURCE_USER, u->ref_uid); + + unit_unref_uid_internal(u, &u->ref_uid, destroy_now, manager_unref_uid); +} + +static void unit_unref_gid(Unit *u, bool destroy_now) { + assert(u); + + unit_modify_user_nft_set(u, /* add = */ false, NFT_SET_SOURCE_GROUP, u->ref_gid); + + unit_unref_uid_internal(u, (uid_t*) &u->ref_gid, destroy_now, manager_unref_gid); +} + +void unit_unref_uid_gid(Unit *u, bool destroy_now) { + assert(u); + + unit_unref_uid(u, destroy_now); + unit_unref_gid(u, destroy_now); +} + +static int unit_ref_uid_internal( + Unit *u, + uid_t *ref_uid, + uid_t uid, + bool clean_ipc, + int (*_manager_ref_uid)(Manager *m, uid_t uid, bool clean_ipc)) { + + int r; + + assert(u); + assert(ref_uid); + assert(uid_is_valid(uid)); + assert(_manager_ref_uid); + + /* Generic implementation of both unit_ref_uid() and unit_ref_guid(), under the assumption that uid_t and gid_t + * are actually the same type, and have the same validity rules. + * + * Adds a reference on a specific UID/GID to this unit. Each unit referencing the same UID/GID maintains a + * reference so that we can destroy the UID/GID's IPC resources as soon as this is requested and the counter + * drops to zero. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (*ref_uid == uid) + return 0; + + if (uid_is_valid(*ref_uid)) /* Already set? */ + return -EBUSY; + + r = _manager_ref_uid(u->manager, uid, clean_ipc); + if (r < 0) + return r; + + *ref_uid = uid; + return 1; +} + +static int unit_ref_uid(Unit *u, uid_t uid, bool clean_ipc) { + return unit_ref_uid_internal(u, &u->ref_uid, uid, clean_ipc, manager_ref_uid); +} + +static int unit_ref_gid(Unit *u, gid_t gid, bool clean_ipc) { + return unit_ref_uid_internal(u, (uid_t*) &u->ref_gid, (uid_t) gid, clean_ipc, manager_ref_gid); +} + +static int unit_ref_uid_gid_internal(Unit *u, uid_t uid, gid_t gid, bool clean_ipc) { + int r = 0, q = 0; + + assert(u); + + /* Reference both a UID and a GID in one go. Either references both, or neither. */ + + if (uid_is_valid(uid)) { + r = unit_ref_uid(u, uid, clean_ipc); + if (r < 0) + return r; + } + + if (gid_is_valid(gid)) { + q = unit_ref_gid(u, gid, clean_ipc); + if (q < 0) { + if (r > 0) + unit_unref_uid(u, false); + + return q; + } + } + + return r > 0 || q > 0; +} + +int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid) { + ExecContext *c; + int r; + + assert(u); + + c = unit_get_exec_context(u); + + r = unit_ref_uid_gid_internal(u, uid, gid, c ? c->remove_ipc : false); + if (r < 0) + return log_unit_warning_errno(u, r, "Couldn't add UID/GID reference to unit, proceeding without: %m"); + + unit_modify_user_nft_set(u, /* add = */ true, NFT_SET_SOURCE_USER, uid); + unit_modify_user_nft_set(u, /* add = */ true, NFT_SET_SOURCE_GROUP, gid); + + return r; +} + +void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid) { + int r; + + assert(u); + + /* This is invoked whenever one of the forked off processes let's us know the UID/GID its user name/group names + * resolved to. We keep track of which UID/GID is currently assigned in order to be able to destroy its IPC + * objects when no service references the UID/GID anymore. */ + + r = unit_ref_uid_gid(u, uid, gid); + if (r > 0) + unit_add_to_dbus_queue(u); +} + +int unit_acquire_invocation_id(Unit *u) { + sd_id128_t id; + int r; + + assert(u); + + r = sd_id128_randomize(&id); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to generate invocation ID for unit: %m"); + + r = unit_set_invocation_id(u, id); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set invocation ID for unit: %m"); + + unit_add_to_dbus_queue(u); + return 0; +} + +int unit_set_exec_params(Unit *u, ExecParameters *p) { + const char *confirm_spawn; + int r; + + assert(u); + assert(p); + + /* Copy parameters from manager */ + r = manager_get_effective_environment(u->manager, &p->environment); + if (r < 0) + return r; + + p->runtime_scope = u->manager->runtime_scope; + + confirm_spawn = manager_get_confirm_spawn(u->manager); + if (confirm_spawn) { + p->confirm_spawn = strdup(confirm_spawn); + if (!p->confirm_spawn) + return -ENOMEM; + } + + p->cgroup_supported = u->manager->cgroup_supported; + p->prefix = u->manager->prefix; + SET_FLAG(p->flags, EXEC_PASS_LOG_UNIT|EXEC_CHOWN_DIRECTORIES, MANAGER_IS_SYSTEM(u->manager)); + + /* Copy parameters from unit */ + p->cgroup_path = u->cgroup_path; + SET_FLAG(p->flags, EXEC_CGROUP_DELEGATE, unit_cgroup_delegate(u)); + + p->received_credentials_directory = u->manager->received_credentials_directory; + p->received_encrypted_credentials_directory = u->manager->received_encrypted_credentials_directory; + + p->shall_confirm_spawn = u->manager->confirm_spawn; + + p->fallback_smack_process_label = u->manager->defaults.smack_process_label; + + if (u->manager->restrict_fs && p->bpf_outer_map_fd < 0) { + int fd = lsm_bpf_map_restrict_fs_fd(u); + if (fd < 0) + return fd; + + p->bpf_outer_map_fd = fd; + } + + p->user_lookup_fd = u->manager->user_lookup_fds[1]; + + p->cgroup_id = u->cgroup_id; + p->invocation_id = u->invocation_id; + sd_id128_to_string(p->invocation_id, p->invocation_id_string); + p->unit_id = strdup(u->id); + if (!p->unit_id) + return -ENOMEM; + + return 0; +} + +int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret) { + pid_t pid; + int r; + + assert(u); + assert(ret); + + /* Forks off a helper process and makes sure it is a member of the unit's cgroup. Returns == 0 in the child, + * and > 0 in the parent. The pid parameter is always filled in with the child's PID. */ + + (void) unit_realize_cgroup(u); + + r = safe_fork(name, FORK_REOPEN_LOG|FORK_DEATHSIG_SIGTERM, &pid); + if (r < 0) + return r; + if (r > 0) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + int q; + + /* Parent */ + + q = pidref_set_pid(&pidref, pid); + if (q < 0) + return q; + + *ret = TAKE_PIDREF(pidref); + return r; + } + + /* Child */ + + (void) default_signals(SIGNALS_CRASH_HANDLER, SIGNALS_IGNORE); + (void) ignore_signals(SIGPIPE); + + if (u->cgroup_path) { + r = cg_attach_everywhere(u->manager->cgroup_supported, u->cgroup_path, 0, NULL, NULL); + if (r < 0) { + log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", empty_to_root(u->cgroup_path)); + _exit(EXIT_CGROUP); + } + } + + return 0; +} + +int unit_fork_and_watch_rm_rf(Unit *u, char **paths, PidRef *ret_pid) { + _cleanup_(pidref_done) PidRef pid = PIDREF_NULL; + int r; + + assert(u); + assert(ret_pid); + + r = unit_fork_helper_process(u, "(sd-rmrf)", &pid); + if (r < 0) + return r; + if (r == 0) { + int ret = EXIT_SUCCESS; + + STRV_FOREACH(i, paths) { + r = rm_rf(*i, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_MISSING_OK); + if (r < 0) { + log_error_errno(r, "Failed to remove '%s': %m", *i); + ret = EXIT_FAILURE; + } + } + + _exit(ret); + } + + r = unit_watch_pidref(u, &pid, /* exclusive= */ true); + if (r < 0) + return r; + + *ret_pid = TAKE_PIDREF(pid); + return 0; +} + +static void unit_update_dependency_mask(Hashmap *deps, Unit *other, UnitDependencyInfo di) { + assert(deps); + assert(other); + + if (di.origin_mask == 0 && di.destination_mask == 0) + /* No bit set anymore, let's drop the whole entry */ + assert_se(hashmap_remove(deps, other)); + else + /* Mask was reduced, let's update the entry */ + assert_se(hashmap_update(deps, other, di.data) == 0); +} + +void unit_remove_dependencies(Unit *u, UnitDependencyMask mask) { + Hashmap *deps; + assert(u); + + /* Removes all dependencies u has on other units marked for ownership by 'mask'. */ + + if (mask == 0) + return; + + HASHMAP_FOREACH(deps, u->dependencies) { + bool done; + + do { + UnitDependencyInfo di; + Unit *other; + + done = true; + + HASHMAP_FOREACH_KEY(di.data, other, deps) { + Hashmap *other_deps; + + if (FLAGS_SET(~mask, di.origin_mask)) + continue; + + di.origin_mask &= ~mask; + unit_update_dependency_mask(deps, other, di); + + /* We updated the dependency from our unit to the other unit now. But most + * dependencies imply a reverse dependency. Hence, let's delete that one + * too. For that we go through all dependency types on the other unit and + * delete all those which point to us and have the right mask set. */ + + HASHMAP_FOREACH(other_deps, other->dependencies) { + UnitDependencyInfo dj; + + dj.data = hashmap_get(other_deps, u); + if (FLAGS_SET(~mask, dj.destination_mask)) + continue; + + dj.destination_mask &= ~mask; + unit_update_dependency_mask(other_deps, u, dj); + } + + unit_add_to_gc_queue(other); + + /* The unit 'other' may not be wanted by the unit 'u'. */ + unit_submit_to_stop_when_unneeded_queue(other); + + done = false; + break; + } + + } while (!done); + } +} + +static int unit_get_invocation_path(Unit *u, char **ret) { + char *p; + int r; + + assert(u); + assert(ret); + + if (MANAGER_IS_SYSTEM(u->manager)) + p = strjoin("/run/systemd/units/invocation:", u->id); + else { + _cleanup_free_ char *user_path = NULL; + r = xdg_user_runtime_dir(&user_path, "/systemd/units/invocation:"); + if (r < 0) + return r; + p = strjoin(user_path, u->id); + } + + if (!p) + return -ENOMEM; + + *ret = p; + return 0; +} + +static int unit_export_invocation_id(Unit *u) { + _cleanup_free_ char *p = NULL; + int r; + + assert(u); + + if (u->exported_invocation_id) + return 0; + + if (sd_id128_is_null(u->invocation_id)) + return 0; + + r = unit_get_invocation_path(u, &p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to get invocation path: %m"); + + r = symlink_atomic_label(u->invocation_id_string, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create invocation ID symlink %s: %m", p); + + u->exported_invocation_id = true; + return 0; +} + +static int unit_export_log_level_max(Unit *u, const ExecContext *c) { + const char *p; + char buf[2]; + int r; + + assert(u); + assert(c); + + if (u->exported_log_level_max) + return 0; + + if (c->log_level_max < 0) + return 0; + + assert(c->log_level_max <= 7); + + buf[0] = '0' + c->log_level_max; + buf[1] = 0; + + p = strjoina("/run/systemd/units/log-level-max:", u->id); + r = symlink_atomic(buf, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create maximum log level symlink %s: %m", p); + + u->exported_log_level_max = true; + return 0; +} + +static int unit_export_log_extra_fields(Unit *u, const ExecContext *c) { + _cleanup_close_ int fd = -EBADF; + struct iovec *iovec; + const char *p; + char *pattern; + le64_t *sizes; + ssize_t n; + int r; + + if (u->exported_log_extra_fields) + return 0; + + if (c->n_log_extra_fields <= 0) + return 0; + + sizes = newa(le64_t, c->n_log_extra_fields); + iovec = newa(struct iovec, c->n_log_extra_fields * 2); + + for (size_t i = 0; i < c->n_log_extra_fields; i++) { + sizes[i] = htole64(c->log_extra_fields[i].iov_len); + + iovec[i*2] = IOVEC_MAKE(sizes + i, sizeof(le64_t)); + iovec[i*2+1] = c->log_extra_fields[i]; + } + + p = strjoina("/run/systemd/units/log-extra-fields:", u->id); + pattern = strjoina(p, ".XXXXXX"); + + fd = mkostemp_safe(pattern); + if (fd < 0) + return log_unit_debug_errno(u, fd, "Failed to create extra fields file %s: %m", p); + + n = writev(fd, iovec, c->n_log_extra_fields*2); + if (n < 0) { + r = log_unit_debug_errno(u, errno, "Failed to write extra fields: %m"); + goto fail; + } + + (void) fchmod(fd, 0644); + + if (rename(pattern, p) < 0) { + r = log_unit_debug_errno(u, errno, "Failed to rename extra fields file: %m"); + goto fail; + } + + u->exported_log_extra_fields = true; + return 0; + +fail: + (void) unlink(pattern); + return r; +} + +static int unit_export_log_ratelimit_interval(Unit *u, const ExecContext *c) { + _cleanup_free_ char *buf = NULL; + const char *p; + int r; + + assert(u); + assert(c); + + if (u->exported_log_ratelimit_interval) + return 0; + + if (c->log_ratelimit_interval_usec == 0) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id); + + if (asprintf(&buf, "%" PRIu64, c->log_ratelimit_interval_usec) < 0) + return log_oom(); + + r = symlink_atomic(buf, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create log rate limit interval symlink %s: %m", p); + + u->exported_log_ratelimit_interval = true; + return 0; +} + +static int unit_export_log_ratelimit_burst(Unit *u, const ExecContext *c) { + _cleanup_free_ char *buf = NULL; + const char *p; + int r; + + assert(u); + assert(c); + + if (u->exported_log_ratelimit_burst) + return 0; + + if (c->log_ratelimit_burst == 0) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id); + + if (asprintf(&buf, "%u", c->log_ratelimit_burst) < 0) + return log_oom(); + + r = symlink_atomic(buf, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create log rate limit burst symlink %s: %m", p); + + u->exported_log_ratelimit_burst = true; + return 0; +} + +void unit_export_state_files(Unit *u) { + const ExecContext *c; + + assert(u); + + if (!u->id) + return; + + if (MANAGER_IS_TEST_RUN(u->manager)) + return; + + /* Exports a couple of unit properties to /run/systemd/units/, so that journald can quickly query this data + * from there. Ideally, journald would use IPC to query this, like everybody else, but that's hard, as long as + * the IPC system itself and PID 1 also log to the journal. + * + * Note that these files really shouldn't be considered API for anyone else, as use a runtime file system as + * IPC replacement is not compatible with today's world of file system namespaces. However, this doesn't really + * apply to communication between the journal and systemd, as we assume that these two daemons live in the same + * namespace at least. + * + * Note that some of the "files" exported here are actually symlinks and not regular files. Symlinks work + * better for storing small bits of data, in particular as we can write them with two system calls, and read + * them with one. */ + + (void) unit_export_invocation_id(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + c = unit_get_exec_context(u); + if (c) { + (void) unit_export_log_level_max(u, c); + (void) unit_export_log_extra_fields(u, c); + (void) unit_export_log_ratelimit_interval(u, c); + (void) unit_export_log_ratelimit_burst(u, c); + } +} + +void unit_unlink_state_files(Unit *u) { + const char *p; + + assert(u); + + if (!u->id) + return; + + /* Undoes the effect of unit_export_state() */ + + if (u->exported_invocation_id) { + _cleanup_free_ char *invocation_path = NULL; + int r = unit_get_invocation_path(u, &invocation_path); + if (r >= 0) { + (void) unlink(invocation_path); + u->exported_invocation_id = false; + } + } + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + if (u->exported_log_level_max) { + p = strjoina("/run/systemd/units/log-level-max:", u->id); + (void) unlink(p); + + u->exported_log_level_max = false; + } + + if (u->exported_log_extra_fields) { + p = strjoina("/run/systemd/units/extra-fields:", u->id); + (void) unlink(p); + + u->exported_log_extra_fields = false; + } + + if (u->exported_log_ratelimit_interval) { + p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id); + (void) unlink(p); + + u->exported_log_ratelimit_interval = false; + } + + if (u->exported_log_ratelimit_burst) { + p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id); + (void) unlink(p); + + u->exported_log_ratelimit_burst = false; + } +} + +int unit_prepare_exec(Unit *u) { + int r; + + assert(u); + + /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable. + * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */ + r = bpf_firewall_load_custom(u); + if (r < 0) + return r; + + /* Prepares everything so that we can fork of a process for this unit */ + + (void) unit_realize_cgroup(u); + + if (u->reset_accounting) { + (void) unit_reset_accounting(u); + u->reset_accounting = false; + } + + unit_export_state_files(u); + + r = unit_setup_exec_runtime(u); + if (r < 0) + return r; + + return 0; +} + +static bool ignore_leftover_process(const char *comm) { + return comm && comm[0] == '('; /* Most likely our own helper process (PAM?), ignore */ +} + +int unit_log_leftover_process_start(const PidRef *pid, int sig, void *userdata) { + _cleanup_free_ char *comm = NULL; + + assert(pidref_is_set(pid)); + + (void) pidref_get_comm(pid, &comm); + + if (ignore_leftover_process(comm)) + return 0; + + /* During start we print a warning */ + + log_unit_warning(userdata, + "Found left-over process " PID_FMT " (%s) in control group while starting unit. Ignoring.\n" + "This usually indicates unclean termination of a previous run, or service implementation deficiencies.", + pid->pid, strna(comm)); + + return 1; +} + +int unit_log_leftover_process_stop(const PidRef *pid, int sig, void *userdata) { + _cleanup_free_ char *comm = NULL; + + assert(pidref_is_set(pid)); + + (void) pidref_get_comm(pid, &comm); + + if (ignore_leftover_process(comm)) + return 0; + + /* During stop we only print an informational message */ + + log_unit_info(userdata, + "Unit process " PID_FMT " (%s) remains running after unit stopped.", + pid->pid, strna(comm)); + + return 1; +} + +int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func) { + assert(u); + + (void) unit_pick_cgroup_path(u); + + if (!u->cgroup_path) + return 0; + + return cg_kill_recursive( + u->cgroup_path, + /* sig= */ 0, + /* flags= */ 0, + /* set= */ NULL, + log_func, + u); +} + +bool unit_needs_console(Unit *u) { + ExecContext *ec; + UnitActiveState state; + + assert(u); + + state = unit_active_state(u); + + if (UNIT_IS_INACTIVE_OR_FAILED(state)) + return false; + + if (UNIT_VTABLE(u)->needs_console) + return UNIT_VTABLE(u)->needs_console(u); + + /* If this unit type doesn't implement this call, let's use a generic fallback implementation: */ + ec = unit_get_exec_context(u); + if (!ec) + return false; + + return exec_context_may_touch_console(ec); +} + +int unit_pid_attachable(Unit *u, PidRef *pid, sd_bus_error *error) { + int r; + + assert(u); + + /* Checks whether the specified PID is generally good for attaching, i.e. a valid PID, not our manager itself, + * and not a kernel thread either */ + + /* First, a simple range check */ + if (!pidref_is_set(pid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process identifier is not valid."); + + /* Some extra safety check */ + if (pid->pid == 1 || pidref_is_self(pid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a manager process, refusing.", pid->pid); + + /* Don't even begin to bother with kernel threads */ + r = pidref_is_kernel_thread(pid); + if (r == -ESRCH) + return sd_bus_error_setf(error, SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, "Process with ID " PID_FMT " does not exist.", pid->pid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to determine whether process " PID_FMT " is a kernel thread: %m", pid->pid); + if (r > 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a kernel thread, refusing.", pid->pid); + + return 0; +} + +void unit_log_success(Unit *u) { + assert(u); + + /* Let's show message "Deactivated successfully" in debug mode (when manager is user) rather than in info mode. + * This message has low information value for regular users and it might be a bit overwhelming on a system with + * a lot of devices. */ + log_unit_struct(u, + MANAGER_IS_USER(u->manager) ? LOG_DEBUG : LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_UNIT_SUCCESS_STR, + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Deactivated successfully.")); +} + +void unit_log_failure(Unit *u, const char *result) { + assert(u); + assert(result); + + log_unit_struct(u, LOG_WARNING, + "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILURE_RESULT_STR, + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Failed with result '%s'.", result), + "UNIT_RESULT=%s", result); +} + +void unit_log_skip(Unit *u, const char *result) { + assert(u); + assert(result); + + log_unit_struct(u, LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_UNIT_SKIPPED_STR, + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Skipped due to '%s'.", result), + "UNIT_RESULT=%s", result); +} + +void unit_log_process_exit( + Unit *u, + const char *kind, + const char *command, + bool success, + int code, + int status) { + + int level; + + assert(u); + assert(kind); + + /* If this is a successful exit, let's log about the exit code on DEBUG level. If this is a failure + * and the process exited on its own via exit(), then let's make this a NOTICE, under the assumption + * that the service already logged the reason at a higher log level on its own. Otherwise, make it a + * WARNING. */ + if (success) + level = LOG_DEBUG; + else if (code == CLD_EXITED) + level = LOG_NOTICE; + else + level = LOG_WARNING; + + log_unit_struct(u, level, + "MESSAGE_ID=" SD_MESSAGE_UNIT_PROCESS_EXIT_STR, + LOG_UNIT_MESSAGE(u, "%s exited, code=%s, status=%i/%s%s", + kind, + sigchld_code_to_string(code), status, + strna(code == CLD_EXITED + ? exit_status_to_string(status, EXIT_STATUS_FULL) + : signal_to_string(status)), + success ? " (success)" : ""), + "EXIT_CODE=%s", sigchld_code_to_string(code), + "EXIT_STATUS=%i", status, + "COMMAND=%s", strna(command), + LOG_UNIT_INVOCATION_ID(u)); +} + +int unit_exit_status(Unit *u) { + assert(u); + + /* Returns the exit status to propagate for the most recent cycle of this unit. Returns a value in the range + * 0…255 if there's something to propagate. EOPNOTSUPP if the concept does not apply to this unit type, ENODATA + * if no data is currently known (for example because the unit hasn't deactivated yet) and EBADE if the main + * service process has exited abnormally (signal/coredump). */ + + if (!UNIT_VTABLE(u)->exit_status) + return -EOPNOTSUPP; + + return UNIT_VTABLE(u)->exit_status(u); +} + +int unit_failure_action_exit_status(Unit *u) { + int r; + + assert(u); + + /* Returns the exit status to propagate on failure, or an error if there's nothing to propagate */ + + if (u->failure_action_exit_status >= 0) + return u->failure_action_exit_status; + + r = unit_exit_status(u); + if (r == -EBADE) /* Exited, but not cleanly (i.e. by signal or such) */ + return 255; + + return r; +} + +int unit_success_action_exit_status(Unit *u) { + int r; + + assert(u); + + /* Returns the exit status to propagate on success, or an error if there's nothing to propagate */ + + if (u->success_action_exit_status >= 0) + return u->success_action_exit_status; + + r = unit_exit_status(u); + if (r == -EBADE) /* Exited, but not cleanly (i.e. by signal or such) */ + return 255; + + return r; +} + +int unit_test_trigger_loaded(Unit *u) { + Unit *trigger; + + /* Tests whether the unit to trigger is loaded */ + + trigger = UNIT_TRIGGER(u); + if (!trigger) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT), + "Refusing to start, no unit to trigger."); + if (trigger->load_state != UNIT_LOADED) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT), + "Refusing to start, unit %s to trigger not loaded.", trigger->id); + + return 0; +} + +void unit_destroy_runtime_data(Unit *u, const ExecContext *context) { + assert(u); + assert(context); + + /* EXEC_PRESERVE_RESTART is handled via unit_release_resources()! */ + if (context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO) + exec_context_destroy_runtime_directory(context, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + + exec_context_destroy_credentials(u); + exec_context_destroy_mount_ns_dir(u); +} + +int unit_clean(Unit *u, ExecCleanMask mask) { + UnitActiveState state; + + assert(u); + + /* Special return values: + * + * -EOPNOTSUPP → cleaning not supported for this unit type + * -EUNATCH → cleaning not defined for this resource type + * -EBUSY → unit currently can't be cleaned since it's running or not properly loaded, or has + * a job queued or similar + */ + + if (!UNIT_VTABLE(u)->clean) + return -EOPNOTSUPP; + + if (mask == 0) + return -EUNATCH; + + if (u->load_state != UNIT_LOADED) + return -EBUSY; + + if (u->job) + return -EBUSY; + + state = unit_active_state(u); + if (state != UNIT_INACTIVE) + return -EBUSY; + + return UNIT_VTABLE(u)->clean(u, mask); +} + +int unit_can_clean(Unit *u, ExecCleanMask *ret) { + assert(u); + + if (!UNIT_VTABLE(u)->clean || + u->load_state != UNIT_LOADED) { + *ret = 0; + return 0; + } + + /* When the clean() method is set, can_clean() really should be set too */ + assert(UNIT_VTABLE(u)->can_clean); + + return UNIT_VTABLE(u)->can_clean(u, ret); +} + +bool unit_can_start_refuse_manual(Unit *u) { + return unit_can_start(u) && !u->refuse_manual_start; +} + +bool unit_can_stop_refuse_manual(Unit *u) { + return unit_can_stop(u) && !u->refuse_manual_stop; +} + +bool unit_can_isolate_refuse_manual(Unit *u) { + return unit_can_isolate(u) && !u->refuse_manual_start; +} + +bool unit_can_freeze(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->can_freeze) + return UNIT_VTABLE(u)->can_freeze(u); + + return UNIT_VTABLE(u)->freeze; +} + +void unit_frozen(Unit *u) { + assert(u); + + u->freezer_state = FREEZER_FROZEN; + + bus_unit_send_pending_freezer_message(u, false); +} + +void unit_thawed(Unit *u) { + assert(u); + + u->freezer_state = FREEZER_RUNNING; + + bus_unit_send_pending_freezer_message(u, false); +} + +static int unit_freezer_action(Unit *u, FreezerAction action) { + UnitActiveState s; + int (*method)(Unit*); + int r; + + assert(u); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); + + method = action == FREEZER_FREEZE ? UNIT_VTABLE(u)->freeze : UNIT_VTABLE(u)->thaw; + if (!method || !cg_freezer_supported()) + return -EOPNOTSUPP; + + if (u->job) + return -EBUSY; + + if (u->load_state != UNIT_LOADED) + return -EHOSTDOWN; + + s = unit_active_state(u); + if (s != UNIT_ACTIVE) + return -EHOSTDOWN; + + if ((IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING) && action == FREEZER_FREEZE) || + (u->freezer_state == FREEZER_THAWING && action == FREEZER_THAW)) + return -EALREADY; + + r = method(u); + if (r <= 0) + return r; + + assert(IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)); + + return 1; +} + +int unit_freeze(Unit *u) { + return unit_freezer_action(u, FREEZER_FREEZE); +} + +int unit_thaw(Unit *u) { + return unit_freezer_action(u, FREEZER_THAW); +} + +/* Wrappers around low-level cgroup freezer operations common for service and scope units */ +int unit_freeze_vtable_common(Unit *u) { + return unit_cgroup_freezer_action(u, FREEZER_FREEZE); +} + +int unit_thaw_vtable_common(Unit *u) { + return unit_cgroup_freezer_action(u, FREEZER_THAW); +} + +Condition *unit_find_failed_condition(Unit *u) { + Condition *failed_trigger = NULL; + bool has_succeeded_trigger = false; + + if (u->condition_result) + return NULL; + + LIST_FOREACH(conditions, c, u->conditions) + if (c->trigger) { + if (c->result == CONDITION_SUCCEEDED) + has_succeeded_trigger = true; + else if (!failed_trigger) + failed_trigger = c; + } else if (c->result != CONDITION_SUCCEEDED) + return c; + + return failed_trigger && !has_succeeded_trigger ? failed_trigger : NULL; +} + +static const char* const collect_mode_table[_COLLECT_MODE_MAX] = { + [COLLECT_INACTIVE] = "inactive", + [COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed", +}; + +DEFINE_STRING_TABLE_LOOKUP(collect_mode, CollectMode); + +Unit* unit_has_dependency(const Unit *u, UnitDependencyAtom atom, Unit *other) { + Unit *i; + + assert(u); + + /* Checks if the unit has a dependency on 'other' with the specified dependency atom. If 'other' is + * NULL checks if the unit has *any* dependency of that atom. Returns 'other' if found (or if 'other' + * is NULL the first entry found), or NULL if not found. */ + + UNIT_FOREACH_DEPENDENCY(i, u, atom) + if (!other || other == i) + return i; + + return NULL; +} + +int unit_get_dependency_array(const Unit *u, UnitDependencyAtom atom, Unit ***ret_array) { + _cleanup_free_ Unit **array = NULL; + size_t n = 0; + Unit *other; + + assert(u); + assert(ret_array); + + /* Gets a list of units matching a specific atom as array. This is useful when iterating through + * dependencies while modifying them: the array is an "atomic snapshot" of sorts, that can be read + * while the dependency table is continuously updated. */ + + UNIT_FOREACH_DEPENDENCY(other, u, atom) { + if (!GREEDY_REALLOC(array, n + 1)) + return -ENOMEM; + + array[n++] = other; + } + + *ret_array = TAKE_PTR(array); + + assert(n <= INT_MAX); + return (int) n; +} + +int unit_get_transitive_dependency_set(Unit *u, UnitDependencyAtom atom, Set **ret) { + _cleanup_set_free_ Set *units = NULL, *queue = NULL; + Unit *other; + int r; + + assert(u); + assert(ret); + + /* Similar to unit_get_dependency_array(), but also search the same dependency in other units. */ + + do { + UNIT_FOREACH_DEPENDENCY(other, u, atom) { + r = set_ensure_put(&units, NULL, other); + if (r < 0) + return r; + if (r == 0) + continue; + r = set_ensure_put(&queue, NULL, other); + if (r < 0) + return r; + } + } while ((u = set_steal_first(queue))); + + *ret = TAKE_PTR(units); + return 0; +} + +int unit_arm_timer( + Unit *u, + sd_event_source **source, + bool relative, + usec_t usec, + sd_event_time_handler_t handler) { + + int r; + + assert(u); + assert(source); + assert(handler); + + if (*source) { + if (usec == USEC_INFINITY) + return sd_event_source_set_enabled(*source, SD_EVENT_OFF); + + r = (relative ? sd_event_source_set_time_relative : sd_event_source_set_time)(*source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(*source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = (relative ? sd_event_add_time_relative : sd_event_add_time)( + u->manager->event, + source, + CLOCK_MONOTONIC, + usec, 0, + handler, + u); + if (r < 0) + return r; + + const char *d = strjoina(unit_type_to_string(u->type), "-timer"); + (void) sd_event_source_set_description(*source, d); + + return 0; +} + +static int unit_get_nice(Unit *u) { + ExecContext *ec; + + ec = unit_get_exec_context(u); + return ec ? ec->nice : 0; +} + +static uint64_t unit_get_cpu_weight(Unit *u) { + CGroupContext *cc; + + cc = unit_get_cgroup_context(u); + return cc ? cgroup_context_cpu_weight(cc, manager_state(u->manager)) : CGROUP_WEIGHT_DEFAULT; +} + +int unit_compare_priority(Unit *a, Unit *b) { + int ret; + + ret = CMP(a->type, b->type); + if (ret != 0) + return -ret; + + ret = CMP(unit_get_cpu_weight(a), unit_get_cpu_weight(b)); + if (ret != 0) + return -ret; + + ret = CMP(unit_get_nice(a), unit_get_nice(b)); + if (ret != 0) + return ret; + + return strcmp(a->id, b->id); +} + +const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX] = { + [UNIT_PATH] = &activation_details_path_vtable, + [UNIT_TIMER] = &activation_details_timer_vtable, +}; + +ActivationDetails *activation_details_new(Unit *trigger_unit) { + _cleanup_free_ ActivationDetails *details = NULL; + + assert(trigger_unit); + assert(trigger_unit->type != _UNIT_TYPE_INVALID); + assert(trigger_unit->id); + + details = malloc0(activation_details_vtable[trigger_unit->type]->object_size); + if (!details) + return NULL; + + *details = (ActivationDetails) { + .n_ref = 1, + .trigger_unit_type = trigger_unit->type, + }; + + details->trigger_unit_name = strdup(trigger_unit->id); + if (!details->trigger_unit_name) + return NULL; + + if (ACTIVATION_DETAILS_VTABLE(details)->init) + ACTIVATION_DETAILS_VTABLE(details)->init(details, trigger_unit); + + return TAKE_PTR(details); +} + +static ActivationDetails *activation_details_free(ActivationDetails *details) { + if (!details) + return NULL; + + if (ACTIVATION_DETAILS_VTABLE(details)->done) + ACTIVATION_DETAILS_VTABLE(details)->done(details); + + free(details->trigger_unit_name); + + return mfree(details); +} + +void activation_details_serialize(ActivationDetails *details, FILE *f) { + if (!details || details->trigger_unit_type == _UNIT_TYPE_INVALID) + return; + + (void) serialize_item(f, "activation-details-unit-type", unit_type_to_string(details->trigger_unit_type)); + if (details->trigger_unit_name) + (void) serialize_item(f, "activation-details-unit-name", details->trigger_unit_name); + if (ACTIVATION_DETAILS_VTABLE(details)->serialize) + ACTIVATION_DETAILS_VTABLE(details)->serialize(details, f); +} + +int activation_details_deserialize(const char *key, const char *value, ActivationDetails **details) { + int r; + + assert(key); + assert(value); + assert(details); + + if (!*details) { + UnitType t; + + if (!streq(key, "activation-details-unit-type")) + return -EINVAL; + + t = unit_type_from_string(value); + if (t < 0) + return t; + + /* The activation details vtable has defined ops only for path and timer units */ + if (!activation_details_vtable[t]) + return -EINVAL; + + *details = malloc0(activation_details_vtable[t]->object_size); + if (!*details) + return -ENOMEM; + + **details = (ActivationDetails) { + .n_ref = 1, + .trigger_unit_type = t, + }; + + return 0; + } + + if (streq(key, "activation-details-unit-name")) { + r = free_and_strdup(&(*details)->trigger_unit_name, value); + if (r < 0) + return r; + + return 0; + } + + if (ACTIVATION_DETAILS_VTABLE(*details)->deserialize) + return ACTIVATION_DETAILS_VTABLE(*details)->deserialize(key, value, details); + + return -EINVAL; +} + +int activation_details_append_env(ActivationDetails *details, char ***strv) { + int r = 0; + + assert(strv); + + if (!details) + return 0; + + if (!isempty(details->trigger_unit_name)) { + char *s = strjoin("TRIGGER_UNIT=", details->trigger_unit_name); + if (!s) + return -ENOMEM; + + r = strv_consume(strv, TAKE_PTR(s)); + if (r < 0) + return r; + } + + if (ACTIVATION_DETAILS_VTABLE(details)->append_env) { + r = ACTIVATION_DETAILS_VTABLE(details)->append_env(details, strv); + if (r < 0) + return r; + } + + return r + !isempty(details->trigger_unit_name); /* Return the number of variables added to the env block */ +} + +int activation_details_append_pair(ActivationDetails *details, char ***strv) { + int r = 0; + + assert(strv); + + if (!details) + return 0; + + if (!isempty(details->trigger_unit_name)) { + r = strv_extend(strv, "trigger_unit"); + if (r < 0) + return r; + + r = strv_extend(strv, details->trigger_unit_name); + if (r < 0) + return r; + } + + if (ACTIVATION_DETAILS_VTABLE(details)->append_pair) { + r = ACTIVATION_DETAILS_VTABLE(details)->append_pair(details, strv); + if (r < 0) + return r; + } + + return r + !isempty(details->trigger_unit_name); /* Return the number of pairs added to the strv */ +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(ActivationDetails, activation_details, activation_details_free); diff --git a/src/core/unit.h b/src/core/unit.h new file mode 100644 index 0000000..60bc2e3 --- /dev/null +++ b/src/core/unit.h @@ -0,0 +1,1249 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "sd-id128.h" + +#include "bpf-program.h" +#include "cgroup.h" +#include "condition.h" +#include "emergency-action.h" +#include "install.h" +#include "list.h" +#include "pidref.h" +#include "set.h" +#include "show-status.h" +#include "unit-file.h" + +typedef struct UnitRef UnitRef; + +typedef enum KillOperation { + KILL_TERMINATE, + KILL_TERMINATE_AND_LOG, + KILL_RESTART, + KILL_KILL, + KILL_WATCHDOG, + _KILL_OPERATION_MAX, + _KILL_OPERATION_INVALID = -EINVAL, +} KillOperation; + +typedef enum CollectMode { + COLLECT_INACTIVE, + COLLECT_INACTIVE_OR_FAILED, + _COLLECT_MODE_MAX, + _COLLECT_MODE_INVALID = -EINVAL, +} CollectMode; + +static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) { + return IN_SET(t, UNIT_ACTIVE, UNIT_RELOADING); +} + +static inline bool UNIT_IS_ACTIVE_OR_ACTIVATING(UnitActiveState t) { + return IN_SET(t, UNIT_ACTIVE, UNIT_ACTIVATING, UNIT_RELOADING); +} + +static inline bool UNIT_IS_INACTIVE_OR_DEACTIVATING(UnitActiveState t) { + return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED, UNIT_DEACTIVATING); +} + +static inline bool UNIT_IS_INACTIVE_OR_FAILED(UnitActiveState t) { + return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED); +} + +static inline bool UNIT_IS_LOAD_COMPLETE(UnitLoadState t) { + return t >= 0 && t < _UNIT_LOAD_STATE_MAX && t != UNIT_STUB && t != UNIT_MERGED; +} + +/* Stores the 'reason' a dependency was created as a bit mask, i.e. due to which configuration source it came to be. We + * use this so that we can selectively flush out parts of dependencies again. Note that the same dependency might be + * created as a result of multiple "reasons", hence the bitmask. */ +typedef enum UnitDependencyMask { + /* Configured directly by the unit file, .wants/.requires symlink or drop-in, or as an immediate result of a + * non-dependency option configured that way. */ + UNIT_DEPENDENCY_FILE = 1 << 0, + + /* As unconditional implicit dependency (not affected by unit configuration — except by the unit name and + * type) */ + UNIT_DEPENDENCY_IMPLICIT = 1 << 1, + + /* A dependency effected by DefaultDependencies=yes. Note that dependencies marked this way are conceptually + * just a subset of UNIT_DEPENDENCY_FILE, as DefaultDependencies= is itself a unit file setting that can only + * be set in unit files. We make this two separate bits only to help debugging how dependencies came to be. */ + UNIT_DEPENDENCY_DEFAULT = 1 << 2, + + /* A dependency created from udev rules */ + UNIT_DEPENDENCY_UDEV = 1 << 3, + + /* A dependency created because of some unit's RequiresMountsFor= setting */ + UNIT_DEPENDENCY_PATH = 1 << 4, + + /* A dependency initially configured from the mount unit file however the dependency will be updated + * from /proc/self/mountinfo as soon as the kernel will make the entry for that mount available in + * the /proc file */ + UNIT_DEPENDENCY_MOUNT_FILE = 1 << 5, + + /* A dependency created or updated because of data read from /proc/self/mountinfo */ + UNIT_DEPENDENCY_MOUNTINFO = 1 << 6, + + /* A dependency created because of data read from /proc/swaps and no other configuration source */ + UNIT_DEPENDENCY_PROC_SWAP = 1 << 7, + + /* A dependency for units in slices assigned by directly setting Slice= */ + UNIT_DEPENDENCY_SLICE_PROPERTY = 1 << 8, + + _UNIT_DEPENDENCY_MASK_FULL = (1 << 9) - 1, +} UnitDependencyMask; + +/* The Unit's dependencies[] hashmaps use this structure as value. It has the same size as a void pointer, and thus can + * be stored directly as hashmap value, without any indirection. Note that this stores two masks, as both the origin + * and the destination of a dependency might have created it. */ +typedef union UnitDependencyInfo { + void *data; + struct { + UnitDependencyMask origin_mask:16; + UnitDependencyMask destination_mask:16; + } _packed_; +} UnitDependencyInfo; + +/* Store information about why a unit was activated. + * We start with trigger units (.path/.timer), eventually it will be expanded to include more metadata. */ +typedef struct ActivationDetails { + unsigned n_ref; + UnitType trigger_unit_type; + char *trigger_unit_name; +} ActivationDetails; + +/* For casting an activation event into the various unit-specific types */ +#define DEFINE_ACTIVATION_DETAILS_CAST(UPPERCASE, MixedCase, UNIT_TYPE) \ + static inline MixedCase* UPPERCASE(ActivationDetails *a) { \ + if (_unlikely_(!a || a->trigger_unit_type != UNIT_##UNIT_TYPE)) \ + return NULL; \ + \ + return (MixedCase*) a; \ + } + +/* For casting the various unit types into a unit */ +#define ACTIVATION_DETAILS(u) \ + ({ \ + typeof(u) _u_ = (u); \ + ActivationDetails *_w_ = _u_ ? &(_u_)->meta : NULL; \ + _w_; \ + }) + +ActivationDetails *activation_details_new(Unit *trigger_unit); +ActivationDetails *activation_details_ref(ActivationDetails *p); +ActivationDetails *activation_details_unref(ActivationDetails *p); +void activation_details_serialize(ActivationDetails *p, FILE *f); +int activation_details_deserialize(const char *key, const char *value, ActivationDetails **info); +int activation_details_append_env(ActivationDetails *info, char ***strv); +int activation_details_append_pair(ActivationDetails *info, char ***strv); +DEFINE_TRIVIAL_CLEANUP_FUNC(ActivationDetails*, activation_details_unref); + +typedef struct ActivationDetailsVTable { + /* How much memory does an object of this activation type need */ + size_t object_size; + + /* This should reset all type-specific variables. This should not allocate memory, and is called + * with zero-initialized data. It should hence only initialize variables that need to be set != 0. */ + void (*init)(ActivationDetails *info, Unit *trigger_unit); + + /* This should free all type-specific variables. It should be idempotent. */ + void (*done)(ActivationDetails *info); + + /* This should serialize all type-specific variables. */ + void (*serialize)(ActivationDetails *info, FILE *f); + + /* This should deserialize all type-specific variables, one at a time. */ + int (*deserialize)(const char *key, const char *value, ActivationDetails **info); + + /* This should format the type-specific variables for the env block of the spawned service, + * and return the number of added items. */ + int (*append_env)(ActivationDetails *info, char ***strv); + + /* This should append type-specific variables as key/value pairs for the D-Bus property of the job, + * and return the number of added pairs. */ + int (*append_pair)(ActivationDetails *info, char ***strv); +} ActivationDetailsVTable; + +extern const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX]; + +static inline const ActivationDetailsVTable* ACTIVATION_DETAILS_VTABLE(const ActivationDetails *a) { + assert(a); + assert(a->trigger_unit_type < _UNIT_TYPE_MAX); + + return activation_details_vtable[a->trigger_unit_type]; +} + +/* Newer LLVM versions don't like implicit casts from large pointer types to smaller enums, hence let's add + * explicit type-safe helpers for that. */ +static inline UnitDependency UNIT_DEPENDENCY_FROM_PTR(const void *p) { + return PTR_TO_INT(p); +} + +static inline void* UNIT_DEPENDENCY_TO_PTR(UnitDependency d) { + return INT_TO_PTR(d); +} + +#include "job.h" + +struct UnitRef { + /* Keeps tracks of references to a unit. This is useful so + * that we can merge two units if necessary and correct all + * references to them */ + + Unit *source, *target; + LIST_FIELDS(UnitRef, refs_by_target); +}; + +typedef struct Unit { + Manager *manager; + + UnitType type; + UnitLoadState load_state; + Unit *merged_into; + + char *id; /* The one special name that we use for identification */ + char *instance; + + Set *aliases; /* All the other names. */ + + /* For each dependency type we can look up another Hashmap with this, whose key is a Unit* object, + * and whose value encodes why the dependency exists, using the UnitDependencyInfo type. i.e. a + * Hashmap(UnitDependency → Hashmap(Unit* → UnitDependencyInfo)) */ + Hashmap *dependencies; + + /* Similar, for RequiresMountsFor= path dependencies. The key is the path, the value the + * UnitDependencyInfo type */ + Hashmap *requires_mounts_for; + + char *description; + char **documentation; + + /* The SELinux context used for checking access to this unit read off the unit file at load time (do + * not confuse with the selinux_context field in ExecContext which is the SELinux context we'll set + * for processes) */ + char *access_selinux_context; + + char *fragment_path; /* if loaded from a config file this is the primary path to it */ + char *source_path; /* if converted, the source file */ + char **dropin_paths; + + usec_t fragment_not_found_timestamp_hash; + usec_t fragment_mtime; + usec_t source_mtime; + usec_t dropin_mtime; + + /* If this is a transient unit we are currently writing, this is where we are writing it to */ + FILE *transient_file; + + /* Freezer state */ + sd_bus_message *pending_freezer_invocation; + FreezerState freezer_state; + + /* Job timeout and action to take */ + EmergencyAction job_timeout_action; + usec_t job_timeout; + usec_t job_running_timeout; + char *job_timeout_reboot_arg; + + /* If there is something to do with this unit, then this is the installed job for it */ + Job *job; + + /* JOB_NOP jobs are special and can be installed without disturbing the real job. */ + Job *nop_job; + + /* The slot used for watching NameOwnerChanged signals */ + sd_bus_slot *match_bus_slot; + sd_bus_slot *get_name_owner_slot; + + /* References to this unit from clients */ + sd_bus_track *bus_track; + char **deserialized_refs; + + /* References to this */ + LIST_HEAD(UnitRef, refs_by_target); + + /* Conditions to check */ + LIST_HEAD(Condition, conditions); + LIST_HEAD(Condition, asserts); + + dual_timestamp condition_timestamp; + dual_timestamp assert_timestamp; + + /* Updated whenever the low-level state changes */ + dual_timestamp state_change_timestamp; + + /* Updated whenever the (high-level) active state enters or leaves the active or inactive states */ + dual_timestamp inactive_exit_timestamp; + dual_timestamp active_enter_timestamp; + dual_timestamp active_exit_timestamp; + dual_timestamp inactive_enter_timestamp; + + /* Per type list */ + LIST_FIELDS(Unit, units_by_type); + + /* Load queue */ + LIST_FIELDS(Unit, load_queue); + + /* D-Bus queue */ + LIST_FIELDS(Unit, dbus_queue); + + /* Cleanup queue */ + LIST_FIELDS(Unit, cleanup_queue); + + /* GC queue */ + LIST_FIELDS(Unit, gc_queue); + + /* CGroup realize members queue */ + LIST_FIELDS(Unit, cgroup_realize_queue); + + /* cgroup empty queue */ + LIST_FIELDS(Unit, cgroup_empty_queue); + + /* cgroup OOM queue */ + LIST_FIELDS(Unit, cgroup_oom_queue); + + /* Target dependencies queue */ + LIST_FIELDS(Unit, target_deps_queue); + + /* Queue of units with StopWhenUnneeded= set that shall be checked for clean-up. */ + LIST_FIELDS(Unit, stop_when_unneeded_queue); + + /* Queue of units that have an Uphold= dependency from some other unit, and should be checked for starting */ + LIST_FIELDS(Unit, start_when_upheld_queue); + + /* Queue of units that have a BindTo= dependency on some other unit, and should possibly be shut down */ + LIST_FIELDS(Unit, stop_when_bound_queue); + + /* Queue of units that should be checked if they can release resources now */ + LIST_FIELDS(Unit, release_resources_queue); + + /* PIDs we keep an eye on. Note that a unit might have many more, but these are the ones we care + * enough about to process SIGCHLD for */ + Set *pids; /* → PidRef* */ + + /* Used in SIGCHLD and sd_notify() message event invocation logic to avoid that we dispatch the same event + * multiple times on the same unit. */ + unsigned sigchldgen; + unsigned notifygen; + + /* Used during GC sweeps */ + unsigned gc_marker; + + /* Error code when we didn't manage to load the unit (negative) */ + int load_error; + + /* Put a ratelimit on unit starting */ + RateLimit start_ratelimit; + EmergencyAction start_limit_action; + + /* The unit has been marked for reload, restart, etc. Stored as 1u << marker1 | 1u << marker2. */ + unsigned markers; + + /* What to do on failure or success */ + EmergencyAction success_action, failure_action; + int success_action_exit_status, failure_action_exit_status; + char *reboot_arg; + + /* Make sure we never enter endless loops with the StopWhenUnneeded=, BindsTo=, Uphold= logic */ + RateLimit auto_start_stop_ratelimit; + sd_event_source *auto_start_stop_event_source; + + /* Reference to a specific UID/GID */ + uid_t ref_uid; + gid_t ref_gid; + + /* Cached unit file state and preset */ + UnitFileState unit_file_state; + PresetAction unit_file_preset; + + /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */ + nsec_t cpu_usage_base; + nsec_t cpu_usage_last; /* the most recently read value */ + + /* Most recently read value of memory accounting metrics */ + uint64_t memory_accounting_last[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1]; + + /* The current counter of OOM kills initiated by systemd-oomd */ + uint64_t managed_oom_kill_last; + + /* The current counter of the oom_kill field in the memory.events cgroup attribute */ + uint64_t oom_kill_last; + + /* Where the io.stat data was at the time the unit was started */ + uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; + uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */ + + /* Counterparts in the cgroup filesystem */ + char *cgroup_path; + uint64_t cgroup_id; + CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */ + CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */ + CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */ + CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ + + /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */ + int cgroup_control_inotify_wd; + int cgroup_memory_inotify_wd; + + /* Device Controller BPF program */ + BPFProgram *bpf_device_control_installed; + + /* IP BPF Firewalling/accounting */ + int ip_accounting_ingress_map_fd; + int ip_accounting_egress_map_fd; + uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX]; + + int ipv4_allow_map_fd; + int ipv6_allow_map_fd; + int ipv4_deny_map_fd; + int ipv6_deny_map_fd; + BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed; + BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed; + + Set *ip_bpf_custom_ingress; + Set *ip_bpf_custom_ingress_installed; + Set *ip_bpf_custom_egress; + Set *ip_bpf_custom_egress_installed; + + /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd, + * attached to unit cgroup by provided program fd and attach type. */ + Hashmap *bpf_foreign_by_key; + + FDSet *initial_socket_bind_link_fds; +#if BPF_FRAMEWORK + /* BPF links to BPF programs attached to cgroup/bind{4|6} hooks and + * responsible for allowing or denying a unit to bind(2) to a socket + * address. */ + struct bpf_link *ipv4_socket_bind_link; + struct bpf_link *ipv6_socket_bind_link; +#endif + + FDSet *initial_restric_ifaces_link_fds; +#if BPF_FRAMEWORK + struct bpf_link *restrict_ifaces_ingress_bpf_link; + struct bpf_link *restrict_ifaces_egress_bpf_link; +#endif + + /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new + * ones which might have appeared. */ + sd_event_source *rewatch_pids_event_source; + + /* How to start OnSuccess=/OnFailure= units */ + JobMode on_success_job_mode; + JobMode on_failure_job_mode; + + /* If the job had a specific trigger that needs to be advertised (eg: a path unit), store it. */ + ActivationDetails *activation_details; + + /* Tweaking the GC logic */ + CollectMode collect_mode; + + /* The current invocation ID */ + sd_id128_t invocation_id; + char invocation_id_string[SD_ID128_STRING_MAX]; /* useful when logging */ + + /* Garbage collect us we nobody wants or requires us anymore */ + bool stop_when_unneeded; + + /* Create default dependencies */ + bool default_dependencies; + + /* Configure so that the unit survives a system transition without stopping/starting. */ + bool survive_final_kill_signal; + + /* Refuse manual starting, allow starting only indirectly via dependency. */ + bool refuse_manual_start; + + /* Don't allow the user to stop this unit manually, allow stopping only indirectly via dependency. */ + bool refuse_manual_stop; + + /* Allow isolation requests */ + bool allow_isolate; + + /* Ignore this unit when isolating */ + bool ignore_on_isolate; + + /* Did the last condition check succeed? */ + bool condition_result; + bool assert_result; + + /* Is this a transient unit? */ + bool transient; + + /* Is this a unit that is always running and cannot be stopped? */ + bool perpetual; + + /* Booleans indicating membership of this unit in the various queues */ + bool in_load_queue:1; + bool in_dbus_queue:1; + bool in_cleanup_queue:1; + bool in_gc_queue:1; + bool in_cgroup_realize_queue:1; + bool in_cgroup_empty_queue:1; + bool in_cgroup_oom_queue:1; + bool in_target_deps_queue:1; + bool in_stop_when_unneeded_queue:1; + bool in_start_when_upheld_queue:1; + bool in_stop_when_bound_queue:1; + bool in_release_resources_queue:1; + + bool sent_dbus_new_signal:1; + + bool job_running_timeout_set:1; + + bool in_audit:1; + bool on_console:1; + + bool cgroup_realized:1; + bool cgroup_members_mask_valid:1; + + /* Reset cgroup accounting next time we fork something off */ + bool reset_accounting:1; + + bool start_limit_hit:1; + + /* Did we already invoke unit_coldplug() for this unit? */ + bool coldplugged:1; + + /* For transient units: whether to add a bus track reference after creating the unit */ + bool bus_track_add:1; + + /* Remember which unit state files we created */ + bool exported_invocation_id:1; + bool exported_log_level_max:1; + bool exported_log_extra_fields:1; + bool exported_log_ratelimit_interval:1; + bool exported_log_ratelimit_burst:1; + + /* Whether we warned about clamping the CPU quota period */ + bool warned_clamping_cpu_quota_period:1; + + /* When writing transient unit files, stores which section we stored last. If < 0, we didn't write any yet. If + * == 0 we are in the [Unit] section, if > 0 we are in the unit type-specific section. */ + signed int last_section_private:2; +} Unit; + +typedef struct UnitStatusMessageFormats { + const char *starting_stopping[2]; + const char *finished_start_job[_JOB_RESULT_MAX]; + const char *finished_stop_job[_JOB_RESULT_MAX]; + /* If this entry is present, it'll be called to provide a context-dependent format string, + * or NULL to fall back to finished_{start,stop}_job; if those are NULL too, fall back to generic. */ + const char *(*finished_job)(Unit *u, JobType t, JobResult result); +} UnitStatusMessageFormats; + +/* Flags used when writing drop-in files or transient unit files */ +typedef enum UnitWriteFlags { + /* Write a runtime unit file or drop-in (i.e. one below /run) */ + UNIT_RUNTIME = 1 << 0, + + /* Write a persistent drop-in (i.e. one below /etc) */ + UNIT_PERSISTENT = 1 << 1, + + /* Place this item in the per-unit-type private section, instead of [Unit] */ + UNIT_PRIVATE = 1 << 2, + + /* Apply specifier escaping */ + UNIT_ESCAPE_SPECIFIERS = 1 << 3, + + /* Escape elements of ExecStart= syntax, incl. prevention of variable expansion */ + UNIT_ESCAPE_EXEC_SYNTAX_ENV = 1 << 4, + + /* Escape elements of ExecStart=: syntax (no variable expansion) */ + UNIT_ESCAPE_EXEC_SYNTAX = 1 << 5, + + /* Apply C escaping before writing */ + UNIT_ESCAPE_C = 1 << 6, +} UnitWriteFlags; + +/* Returns true if neither persistent, nor runtime storage is requested, i.e. this is a check invocation only */ +static inline bool UNIT_WRITE_FLAGS_NOOP(UnitWriteFlags flags) { + return (flags & (UNIT_RUNTIME|UNIT_PERSISTENT)) == 0; +} + +#include "kill.h" + +typedef struct UnitVTable { + /* How much memory does an object of this unit type need */ + size_t object_size; + + /* If greater than 0, the offset into the object where + * ExecContext is found, if the unit type has that */ + size_t exec_context_offset; + + /* If greater than 0, the offset into the object where + * CGroupContext is found, if the unit type has that */ + size_t cgroup_context_offset; + + /* If greater than 0, the offset into the object where + * KillContext is found, if the unit type has that */ + size_t kill_context_offset; + + /* If greater than 0, the offset into the object where the + * pointer to ExecSharedRuntime is found, if the unit type has + * that */ + size_t exec_runtime_offset; + + /* The name of the configuration file section with the private settings of this unit */ + const char *private_section; + + /* Config file sections this unit type understands, separated + * by NUL chars */ + const char *sections; + + /* This should reset all type-specific variables. This should + * not allocate memory, and is called with zero-initialized + * data. It should hence only initialize variables that need + * to be set != 0. */ + void (*init)(Unit *u); + + /* This should free all type-specific variables. It should be + * idempotent. */ + void (*done)(Unit *u); + + /* Actually load data from disk. This may fail, and should set + * load_state to UNIT_LOADED, UNIT_MERGED or leave it at + * UNIT_STUB if no configuration could be found. */ + int (*load)(Unit *u); + + /* During deserialization we only record the intended state to return to. With coldplug() we actually put the + * deserialized state in effect. This is where unit_notify() should be called to start things up. Note that + * this callback is invoked *before* we leave the reloading state of the manager, i.e. *before* we consider the + * reloading to be complete. Thus, this callback should just restore the exact same state for any unit that was + * in effect before the reload, i.e. units should not catch up with changes happened during the reload. That's + * what catchup() below is for. */ + int (*coldplug)(Unit *u); + + /* This is called shortly after all units' coldplug() call was invoked, and *after* the manager left the + * reloading state. It's supposed to catch up with state changes due to external events we missed so far (for + * example because they took place while we were reloading/reexecing) */ + void (*catchup)(Unit *u); + + void (*dump)(Unit *u, FILE *f, const char *prefix); + + int (*start)(Unit *u); + int (*stop)(Unit *u); + int (*reload)(Unit *u); + + /* Clear out the various runtime/state/cache/logs/configuration data */ + int (*clean)(Unit *u, ExecCleanMask m); + + /* Freeze the unit */ + int (*freeze)(Unit *u); + int (*thaw)(Unit *u); + bool (*can_freeze)(Unit *u); + + /* Return which kind of data can be cleaned */ + int (*can_clean)(Unit *u, ExecCleanMask *ret); + + bool (*can_reload)(Unit *u); + + /* Serialize state and file descriptors that should be carried over into the new + * instance after reexecution. */ + int (*serialize)(Unit *u, FILE *f, FDSet *fds); + + /* Restore one item from the serialization */ + int (*deserialize_item)(Unit *u, const char *key, const char *data, FDSet *fds); + + /* Try to match up fds with what we need for this unit */ + void (*distribute_fds)(Unit *u, FDSet *fds); + + /* Boils down the more complex internal state of this unit to + * a simpler one that the engine can understand */ + UnitActiveState (*active_state)(Unit *u); + + /* Returns the substate specific to this unit type as + * string. This is purely information so that we can give the + * user a more fine grained explanation in which actual state a + * unit is in. */ + const char* (*sub_state_to_string)(Unit *u); + + /* Additionally to UnitActiveState determine whether unit is to be restarted. */ + bool (*will_restart)(Unit *u); + + /* Return false when there is a reason to prevent this unit from being gc'ed + * even though nothing references it and it isn't active in any way. */ + bool (*may_gc)(Unit *u); + + /* Return true when the unit is not controlled by the manager (e.g. extrinsic mounts). */ + bool (*is_extrinsic)(Unit *u); + + /* When the unit is not running and no job for it queued we shall release its runtime resources */ + void (*release_resources)(Unit *u); + + /* Invoked on every child that died */ + void (*sigchld_event)(Unit *u, pid_t pid, int code, int status); + + /* Reset failed state if we are in failed state */ + void (*reset_failed)(Unit *u); + + /* Called whenever any of the cgroups this unit watches for ran empty */ + void (*notify_cgroup_empty)(Unit *u); + + /* Called whenever an OOM kill event on this unit was seen */ + void (*notify_cgroup_oom)(Unit *u, bool managed_oom); + + /* Called whenever a process of this unit sends us a message */ + void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds); + + /* Called whenever a name this Unit registered for comes or goes away. */ + void (*bus_name_owner_change)(Unit *u, const char *new_owner); + + /* Called for each property that is being set */ + int (*bus_set_property)(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + + /* Called after at least one property got changed to apply the necessary change */ + int (*bus_commit_properties)(Unit *u); + + /* Return the unit this unit is following */ + Unit *(*following)(Unit *u); + + /* Return the set of units that are following each other */ + int (*following_set)(Unit *u, Set **s); + + /* Invoked each time a unit this unit is triggering changes + * state or gains/loses a job */ + void (*trigger_notify)(Unit *u, Unit *trigger); + + /* Called whenever CLOCK_REALTIME made a jump */ + void (*time_change)(Unit *u); + + /* Called whenever /etc/localtime was modified */ + void (*timezone_change)(Unit *u); + + /* Returns the next timeout of a unit */ + int (*get_timeout)(Unit *u, usec_t *timeout); + + /* Returns the start timeout of a unit */ + usec_t (*get_timeout_start_usec)(Unit *u); + + /* Returns the main PID if there is any defined, or 0. */ + PidRef* (*main_pid)(Unit *u); + + /* Returns the control PID if there is any defined, or 0. */ + PidRef* (*control_pid)(Unit *u); + + /* Returns true if the unit currently needs access to the console */ + bool (*needs_console)(Unit *u); + + /* Returns the exit status to propagate in case of FailureAction=exit/SuccessAction=exit; usually returns the + * exit code of the "main" process of the service or similar. */ + int (*exit_status)(Unit *u); + + /* Return a copy of the status string pointer. */ + const char* (*status_text)(Unit *u); + + /* Like the enumerate() callback further down, but only enumerates the perpetual units, i.e. all units that + * unconditionally exist and are always active. The main reason to keep both enumeration functions separate is + * philosophical: the state of perpetual units should be put in place by coldplug(), while the state of those + * discovered through regular enumeration should be put in place by catchup(), see below. */ + void (*enumerate_perpetual)(Manager *m); + + /* This is called for each unit type and should be used to enumerate units already existing in the system + * internally and load them. However, everything that is loaded here should still stay in inactive state. It is + * the job of the catchup() call above to put the units into the discovered state. */ + void (*enumerate)(Manager *m); + + /* Type specific cleanups. */ + void (*shutdown)(Manager *m); + + /* If this function is set and returns false all jobs for units + * of this type will immediately fail. */ + bool (*supported)(void); + + /* If this function is set, it's invoked first as part of starting a unit to allow start rate + * limiting checks to occur before we do anything else. */ + int (*can_start)(Unit *u); + + /* Returns > 0 if the whole subsystem is ratelimited, and new start operations should not be started + * for this unit type right now. */ + int (*subsystem_ratelimited)(Manager *m); + + /* The strings to print in status messages */ + UnitStatusMessageFormats status_message_formats; + + /* True if transient units of this type are OK */ + bool can_transient; + + /* True if cgroup delegation is permissible */ + bool can_delegate; + + /* True if the unit type triggers other units, i.e. can have a UNIT_TRIGGERS dependency */ + bool can_trigger; + + /* True if the unit type knows a failure state, and thus can be source of an OnFailure= dependency */ + bool can_fail; + + /* True if units of this type shall be startable only once and then never again */ + bool once_only; + + /* Do not serialize this unit when preparing for root switch */ + bool exclude_from_switch_root_serialization; + + /* True if queued jobs of this type should be GC'ed if no other job needs them anymore */ + bool gc_jobs; + + /* True if systemd-oomd can monitor and act on this unit's recursive children's cgroups */ + bool can_set_managed_oom; + + /* If true, we'll notify plymouth about this unit */ + bool notify_plymouth; + + /* The audit events to generate on start + stop (or 0 if none shall be generated) */ + int audit_start_message_type; + int audit_stop_message_type; +} UnitVTable; + +extern const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX]; + +static inline const UnitVTable* UNIT_VTABLE(const Unit *u) { + return unit_vtable[u->type]; +} + +/* For casting a unit into the various unit types */ +#define DEFINE_CAST(UPPERCASE, MixedCase) \ + static inline MixedCase* UPPERCASE(Unit *u) { \ + if (_unlikely_(!u || u->type != UNIT_##UPPERCASE)) \ + return NULL; \ + \ + return (MixedCase*) u; \ + } + +/* For casting the various unit types into a unit */ +#define UNIT(u) \ + ({ \ + typeof(u) _u_ = (u); \ + Unit *_w_ = _u_ ? &(_u_)->meta : NULL; \ + _w_; \ + }) + +#define UNIT_HAS_EXEC_CONTEXT(u) (UNIT_VTABLE(u)->exec_context_offset > 0) +#define UNIT_HAS_CGROUP_CONTEXT(u) (UNIT_VTABLE(u)->cgroup_context_offset > 0) +#define UNIT_HAS_KILL_CONTEXT(u) (UNIT_VTABLE(u)->kill_context_offset > 0) + +Unit* unit_has_dependency(const Unit *u, UnitDependencyAtom atom, Unit *other); +int unit_get_dependency_array(const Unit *u, UnitDependencyAtom atom, Unit ***ret_array); +int unit_get_transitive_dependency_set(Unit *u, UnitDependencyAtom atom, Set **ret); + +static inline Hashmap* unit_get_dependencies(Unit *u, UnitDependency d) { + return hashmap_get(u->dependencies, UNIT_DEPENDENCY_TO_PTR(d)); +} + +static inline Unit* UNIT_TRIGGER(Unit *u) { + return unit_has_dependency(u, UNIT_ATOM_TRIGGERS, NULL); +} + +static inline Unit* UNIT_GET_SLICE(const Unit *u) { + return unit_has_dependency(u, UNIT_ATOM_IN_SLICE, NULL); +} + +Unit* unit_new(Manager *m, size_t size); +Unit* unit_free(Unit *u); +DEFINE_TRIVIAL_CLEANUP_FUNC(Unit *, unit_free); + +int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret); +int unit_add_name(Unit *u, const char *name); + +int unit_add_dependency(Unit *u, UnitDependency d, Unit *other, bool add_reference, UnitDependencyMask mask); +int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask); + +int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask); +int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask); + +int unit_add_exec_dependencies(Unit *u, ExecContext *c); + +int unit_choose_id(Unit *u, const char *name); +int unit_set_description(Unit *u, const char *description); + +void unit_release_resources(Unit *u); + +bool unit_may_gc(Unit *u); + +static inline bool unit_is_extrinsic(Unit *u) { + return u->perpetual || + (UNIT_VTABLE(u)->is_extrinsic && UNIT_VTABLE(u)->is_extrinsic(u)); +} + +static inline const char* unit_status_text(Unit *u) { + if (u && UNIT_VTABLE(u)->status_text) + return UNIT_VTABLE(u)->status_text(u); + return NULL; +} + +void unit_add_to_load_queue(Unit *u); +void unit_add_to_dbus_queue(Unit *u); +void unit_add_to_cleanup_queue(Unit *u); +void unit_add_to_gc_queue(Unit *u); +void unit_add_to_target_deps_queue(Unit *u); +void unit_submit_to_stop_when_unneeded_queue(Unit *u); +void unit_submit_to_start_when_upheld_queue(Unit *u); +void unit_submit_to_stop_when_bound_queue(Unit *u); +void unit_submit_to_release_resources_queue(Unit *u); + +int unit_merge(Unit *u, Unit *other); +int unit_merge_by_name(Unit *u, const char *other); + +Unit *unit_follow_merge(Unit *u) _pure_; + +int unit_load_fragment_and_dropin(Unit *u, bool fragment_required); +int unit_load(Unit *unit); + +int unit_set_slice(Unit *u, Unit *slice); +int unit_set_default_slice(Unit *u); + +const char *unit_description(Unit *u) _pure_; +const char *unit_status_string(Unit *u, char **combined); + +bool unit_has_name(const Unit *u, const char *name); + +UnitActiveState unit_active_state(Unit *u); +FreezerState unit_freezer_state(Unit *u); +int unit_freezer_state_kernel(Unit *u, FreezerState *ret); + +const char* unit_sub_state_to_string(Unit *u); + +bool unit_can_reload(Unit *u) _pure_; +bool unit_can_start(Unit *u) _pure_; +bool unit_can_stop(Unit *u) _pure_; +bool unit_can_isolate(Unit *u) _pure_; + +int unit_start(Unit *u, ActivationDetails *details); +int unit_stop(Unit *u); +int unit_reload(Unit *u); + +int unit_kill(Unit *u, KillWho w, int signo, int code, int value, sd_bus_error *error); + +void unit_notify_cgroup_oom(Unit *u, bool managed_oom); + +void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_success); + +int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive); +int unit_watch_pid(Unit *u, pid_t pid, bool exclusive); +void unit_unwatch_pidref(Unit *u, PidRef *pid); +void unit_unwatch_pid(Unit *u, pid_t pid); +void unit_unwatch_all_pids(Unit *u); + +int unit_enqueue_rewatch_pids(Unit *u); +void unit_dequeue_rewatch_pids(Unit *u); + +int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name); +int unit_watch_bus_name(Unit *u, const char *name); +void unit_unwatch_bus_name(Unit *u, const char *name); + +bool unit_job_is_applicable(Unit *u, JobType j); + +int set_unit_path(const char *p); + +char *unit_dbus_path(Unit *u); +char *unit_dbus_path_invocation_id(Unit *u); + +int unit_load_related_unit(Unit *u, const char *type, Unit **_found); + +int unit_add_node_dependency(Unit *u, const char *what, UnitDependency d, UnitDependencyMask mask); +int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask); + +int unit_coldplug(Unit *u); +void unit_catchup(Unit *u); + +void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *format, const char *ident) _printf_(4, 0); + +bool unit_need_daemon_reload(Unit *u); + +void unit_reset_failed(Unit *u); + +Unit *unit_following(Unit *u); +int unit_following_set(Unit *u, Set **s); + +const char *unit_slice_name(Unit *u); + +bool unit_stop_pending(Unit *u) _pure_; +bool unit_inactive_or_pending(Unit *u) _pure_; +bool unit_active_or_pending(Unit *u); +bool unit_will_restart_default(Unit *u); +bool unit_will_restart(Unit *u); + +int unit_add_default_target_dependency(Unit *u, Unit *target); + +void unit_start_on_failure(Unit *u, const char *dependency_name, UnitDependencyAtom atom, JobMode job_mode); +void unit_trigger_notify(Unit *u); + +UnitFileState unit_get_unit_file_state(Unit *u); +PresetAction unit_get_unit_file_preset(Unit *u); + +Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target); +void unit_ref_unset(UnitRef *ref); + +#define UNIT_DEREF(ref) ((ref).target) +#define UNIT_ISSET(ref) (!!(ref).target) + +int unit_patch_contexts(Unit *u); + +ExecContext *unit_get_exec_context(const Unit *u) _pure_; +KillContext *unit_get_kill_context(Unit *u) _pure_; +CGroupContext *unit_get_cgroup_context(Unit *u) _pure_; + +ExecRuntime *unit_get_exec_runtime(Unit *u) _pure_; + +int unit_setup_exec_runtime(Unit *u); + +const char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf); +char* unit_concat_strv(char **l, UnitWriteFlags flags); + +int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data); +int unit_write_settingf(Unit *u, UnitWriteFlags mode, const char *name, const char *format, ...) _printf_(4,5); + +int unit_kill_context(Unit *u, KillContext *c, KillOperation k, PidRef *main_pid, PidRef *control_pid, bool main_pid_alien); + +int unit_make_transient(Unit *u); + +int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask); + +bool unit_type_supported(UnitType t); + +bool unit_is_pristine(Unit *u); + +bool unit_is_unneeded(Unit *u); +bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit); +bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit); + +PidRef* unit_control_pid(Unit *u); +PidRef* unit_main_pid(Unit *u); + +void unit_warn_if_dir_nonempty(Unit *u, const char* where); +int unit_fail_if_noncanonical(Unit *u, const char* where); + +int unit_test_start_limit(Unit *u); + +int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid); +void unit_unref_uid_gid(Unit *u, bool destroy_now); + +void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid); + +int unit_set_invocation_id(Unit *u, sd_id128_t id); +int unit_acquire_invocation_id(Unit *u); + +int unit_set_exec_params(Unit *s, ExecParameters *p); + +int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret); +int unit_fork_and_watch_rm_rf(Unit *u, char **paths, PidRef *ret); + +void unit_remove_dependencies(Unit *u, UnitDependencyMask mask); + +void unit_export_state_files(Unit *u); +void unit_unlink_state_files(Unit *u); + +int unit_prepare_exec(Unit *u); + +int unit_log_leftover_process_start(const PidRef* pid, int sig, void *userdata); +int unit_log_leftover_process_stop(const PidRef* pid, int sig, void *userdata); + +int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func); + +bool unit_needs_console(Unit *u); + +int unit_pid_attachable(Unit *unit, PidRef *pid, sd_bus_error *error); + +static inline bool unit_has_job_type(Unit *u, JobType type) { + return u && u->job && u->job->type == type; +} + +static inline bool unit_log_level_test(const Unit *u, int level) { + ExecContext *ec = unit_get_exec_context(u); + return !ec || ec->log_level_max < 0 || ec->log_level_max >= LOG_PRI(level); +} + +/* unit_log_skip is for cases like ExecCondition= where a unit is considered "done" + * after some execution, rather than succeeded or failed. */ +void unit_log_skip(Unit *u, const char *result); +void unit_log_success(Unit *u); +void unit_log_failure(Unit *u, const char *result); +static inline void unit_log_result(Unit *u, bool success, const char *result) { + if (success) + unit_log_success(u); + else + unit_log_failure(u, result); +} + +void unit_log_process_exit(Unit *u, const char *kind, const char *command, bool success, int code, int status); + +int unit_exit_status(Unit *u); +int unit_success_action_exit_status(Unit *u); +int unit_failure_action_exit_status(Unit *u); + +int unit_test_trigger_loaded(Unit *u); + +void unit_destroy_runtime_data(Unit *u, const ExecContext *context); +int unit_clean(Unit *u, ExecCleanMask mask); +int unit_can_clean(Unit *u, ExecCleanMask *ret_mask); + +bool unit_can_start_refuse_manual(Unit *u); +bool unit_can_stop_refuse_manual(Unit *u); +bool unit_can_isolate_refuse_manual(Unit *u); + +bool unit_can_freeze(Unit *u); +int unit_freeze(Unit *u); +void unit_frozen(Unit *u); + +int unit_thaw(Unit *u); +void unit_thawed(Unit *u); + +int unit_freeze_vtable_common(Unit *u); +int unit_thaw_vtable_common(Unit *u); + +Condition *unit_find_failed_condition(Unit *u); + +int unit_arm_timer(Unit *u, sd_event_source **source, bool relative, usec_t usec, sd_event_time_handler_t handler); + +int unit_compare_priority(Unit *a, Unit *b); + +/* Macros which append UNIT= or USER_UNIT= to the message */ + +#define log_unit_full_errno_zerook(unit, level, error, ...) \ + ({ \ + const Unit *_u = (unit); \ + const int _l = (level); \ + bool _do_log = !(log_get_max_level() < LOG_PRI(_l) || \ + (_u && !unit_log_level_test(_u, _l))); \ + const ExecContext *_c = _do_log && _u ? \ + unit_get_exec_context(_u) : NULL; \ + LOG_CONTEXT_PUSH_IOV(_c ? _c->log_extra_fields : NULL, \ + _c ? _c->n_log_extra_fields : 0); \ + !_do_log ? -ERRNO_VALUE(error) : \ + _u ? log_object_internal(_l, error, PROJECT_FILE, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \ + log_internal(_l, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \ + }) + +#define log_unit_full_errno(unit, level, error, ...) \ + ({ \ + int _error = (error); \ + ASSERT_NON_ZERO(_error); \ + log_unit_full_errno_zerook(unit, level, _error, ##__VA_ARGS__); \ + }) + +#define log_unit_full(unit, level, ...) (void) log_unit_full_errno_zerook(unit, level, 0, __VA_ARGS__) + +#define log_unit_debug(unit, ...) log_unit_full(unit, LOG_DEBUG, __VA_ARGS__) +#define log_unit_info(unit, ...) log_unit_full(unit, LOG_INFO, __VA_ARGS__) +#define log_unit_notice(unit, ...) log_unit_full(unit, LOG_NOTICE, __VA_ARGS__) +#define log_unit_warning(unit, ...) log_unit_full(unit, LOG_WARNING, __VA_ARGS__) +#define log_unit_error(unit, ...) log_unit_full(unit, LOG_ERR, __VA_ARGS__) + +#define log_unit_debug_errno(unit, error, ...) log_unit_full_errno(unit, LOG_DEBUG, error, __VA_ARGS__) +#define log_unit_info_errno(unit, error, ...) log_unit_full_errno(unit, LOG_INFO, error, __VA_ARGS__) +#define log_unit_notice_errno(unit, error, ...) log_unit_full_errno(unit, LOG_NOTICE, error, __VA_ARGS__) +#define log_unit_warning_errno(unit, error, ...) log_unit_full_errno(unit, LOG_WARNING, error, __VA_ARGS__) +#define log_unit_error_errno(unit, error, ...) log_unit_full_errno(unit, LOG_ERR, error, __VA_ARGS__) + +#if LOG_TRACE +# define log_unit_trace(...) log_unit_debug(__VA_ARGS__) +# define log_unit_trace_errno(...) log_unit_debug_errno(__VA_ARGS__) +#else +# define log_unit_trace(...) do {} while (0) +# define log_unit_trace_errno(e, ...) (-ERRNO_VALUE(e)) +#endif + +#define log_unit_struct_errno(unit, level, error, ...) \ + ({ \ + const Unit *_u = (unit); \ + const int _l = (level); \ + bool _do_log = unit_log_level_test(_u, _l); \ + const ExecContext *_c = _do_log && _u ? \ + unit_get_exec_context(_u) : NULL; \ + LOG_CONTEXT_PUSH_IOV(_c ? _c->log_extra_fields : NULL, \ + _c ? _c->n_log_extra_fields : 0); \ + _do_log ? \ + log_struct_errno(_l, error, __VA_ARGS__, LOG_UNIT_ID(_u)) : \ + -ERRNO_VALUE(error); \ + }) + +#define log_unit_struct(unit, level, ...) log_unit_struct_errno(unit, level, 0, __VA_ARGS__) + +#define log_unit_struct_iovec_errno(unit, level, error, iovec, n_iovec) \ + ({ \ + const Unit *_u = (unit); \ + const int _l = (level); \ + bool _do_log = unit_log_level_test(_u, _l); \ + const ExecContext *_c = _do_log && _u ? \ + unit_get_exec_context(_u) : NULL; \ + LOG_CONTEXT_PUSH_IOV(_c ? _c->log_extra_fields : NULL, \ + _c ? _c->n_log_extra_fields : 0); \ + _do_log ? \ + log_struct_iovec_errno(_l, error, iovec, n_iovec) : \ + -ERRNO_VALUE(error); \ + }) + +#define log_unit_struct_iovec(unit, level, iovec, n_iovec) log_unit_struct_iovec_errno(unit, level, 0, iovec, n_iovec) + +/* Like LOG_MESSAGE(), but with the unit name prefixed. */ +#define LOG_UNIT_MESSAGE(unit, fmt, ...) LOG_MESSAGE("%s: " fmt, (unit)->id, ##__VA_ARGS__) +#define LOG_UNIT_ID(unit) (unit)->manager->unit_log_format_string, (unit)->id +#define LOG_UNIT_INVOCATION_ID(unit) (unit)->manager->invocation_log_format_string, (unit)->invocation_id_string + +const char* collect_mode_to_string(CollectMode m) _const_; +CollectMode collect_mode_from_string(const char *s) _pure_; + +typedef struct UnitForEachDependencyData { + /* Stores state for the FOREACH macro below for iterating through all deps that have any of the + * specified dependency atom bits set */ + UnitDependencyAtom match_atom; + Hashmap *by_type, *by_unit; + void *current_type; + Iterator by_type_iterator, by_unit_iterator; + Unit **current_unit; +} UnitForEachDependencyData; + +/* Iterates through all dependencies that have a specific atom in the dependency type set. This tries to be + * smart: if the atom is unique, we'll directly go to right entry. Otherwise we'll iterate through the + * per-dependency type hashmap and match all dep that have the right atom set. */ +#define _UNIT_FOREACH_DEPENDENCY(other, u, ma, data) \ + for (UnitForEachDependencyData data = { \ + .match_atom = (ma), \ + .by_type = (u)->dependencies, \ + .by_type_iterator = ITERATOR_FIRST, \ + .current_unit = &(other), \ + }; \ + ({ \ + UnitDependency _dt = _UNIT_DEPENDENCY_INVALID; \ + bool _found; \ + \ + if (data.by_type && ITERATOR_IS_FIRST(data.by_type_iterator)) { \ + _dt = unit_dependency_from_unique_atom(data.match_atom); \ + if (_dt >= 0) { \ + data.by_unit = hashmap_get(data.by_type, UNIT_DEPENDENCY_TO_PTR(_dt)); \ + data.current_type = UNIT_DEPENDENCY_TO_PTR(_dt); \ + data.by_type = NULL; \ + _found = !!data.by_unit; \ + } \ + } \ + if (_dt < 0) \ + _found = hashmap_iterate(data.by_type, \ + &data.by_type_iterator, \ + (void**)&(data.by_unit), \ + (const void**) &(data.current_type)); \ + _found; \ + }); ) \ + if ((unit_dependency_to_atom(UNIT_DEPENDENCY_FROM_PTR(data.current_type)) & data.match_atom) != 0) \ + for (data.by_unit_iterator = ITERATOR_FIRST; \ + hashmap_iterate(data.by_unit, \ + &data.by_unit_iterator, \ + NULL, \ + (const void**) data.current_unit); ) + +/* Note: this matches deps that have *any* of the atoms specified in match_atom set */ +#define UNIT_FOREACH_DEPENDENCY(other, u, match_atom) \ + _UNIT_FOREACH_DEPENDENCY(other, u, match_atom, UNIQ_T(data, UNIQ)) + +#define _LOG_CONTEXT_PUSH_UNIT(unit, u, c) \ + const Unit *u = (unit); \ + const ExecContext *c = unit_get_exec_context(u); \ + LOG_CONTEXT_PUSH_KEY_VALUE(u->manager->unit_log_field, u->id); \ + LOG_CONTEXT_PUSH_KEY_VALUE(u->manager->invocation_log_field, u->invocation_id_string); \ + LOG_CONTEXT_PUSH_IOV(c ? c->log_extra_fields : NULL, c ? c->n_log_extra_fields : 0) + +#define LOG_CONTEXT_PUSH_UNIT(unit) \ + _LOG_CONTEXT_PUSH_UNIT(unit, UNIQ_T(u, UNIQ), UNIQ_T(c, UNIQ)) diff --git a/src/core/user.conf.in b/src/core/user.conf.in new file mode 100644 index 0000000..14f0eae --- /dev/null +++ b/src/core/user.conf.in @@ -0,0 +1,59 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/user.conf.d/ directory. The latter is generally recommended. +# Defaults can be restored by simply deleting the main configuration file and +# all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/user.conf' to display the full config. +# +# See systemd-user.conf(5) for details. + +[Manager] +#LogLevel=info +#LogTarget=auto +#LogColor=yes +#LogLocation=no +#LogTime=no +#SystemCallArchitectures= +#TimerSlackNSec= +#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}} +#DefaultTimerAccuracySec=1min +#DefaultStandardOutput=inherit +#DefaultStandardError=inherit +#DefaultTimeoutStartSec={{DEFAULT_USER_TIMEOUT_SEC}}s +#DefaultTimeoutStopSec={{DEFAULT_USER_TIMEOUT_SEC}}s +#DefaultTimeoutAbortSec= +#DefaultDeviceTimeoutSec={{DEFAULT_USER_TIMEOUT_SEC}}s +#DefaultRestartSec=100ms +#DefaultStartLimitIntervalSec=10s +#DefaultStartLimitBurst=5 +#DefaultEnvironment= +#DefaultLimitCPU= +#DefaultLimitFSIZE= +#DefaultLimitDATA= +#DefaultLimitSTACK= +#DefaultLimitCORE= +#DefaultLimitRSS= +#DefaultLimitNOFILE= +#DefaultLimitAS= +#DefaultLimitNPROC= +#DefaultLimitMEMLOCK= +#DefaultLimitLOCKS= +#DefaultLimitSIGPENDING= +#DefaultLimitMSGQUEUE= +#DefaultLimitNICE= +#DefaultLimitRTPRIO= +#DefaultLimitRTTIME= +#DefaultMemoryPressureThresholdSec=200ms +#DefaultMemoryPressureWatch=auto +#DefaultSmackProcessLabel= +#ReloadLimitIntervalSec= +#ReloadLimitBurst -- cgit v1.2.3